def _find_revision(self, revision):
    """
    Search the known branches (restricted to "try", "mozilla-inbound" and
    "autoland" in the default locale) for the given changeset id, using a
    small pool of worker threads, and return every Revision found.

    :param revision: changeset id to look for
    :return: list of revisions found (possibly empty)
    """
    NUM_WORKERS = 3

    please_stop = False
    locker = Lock()          # protects `output` from concurrent appends
    output = []
    queue = Queue("branches", max=2000)
    queue.extend(
        b
        for b in self.branches
        if b.locale == DEFAULT_LOCALE
        and b.name in ["try", "mozilla-inbound", "autoland"]
    )
    # BUG FIX: add one stop marker PER WORKER.  The original added a single
    # THREAD_STOP, so only one of the three consumers ever saw it; the other
    # two blocked forever iterating the queue, and join() below never returned.
    for _ in range(NUM_WORKERS):
        queue.add(THREAD_STOP)

    problems = []

    def _find(please_stop):
        # CONSUME BRANCHES UNTIL THE STOP MARKER (queue iteration ends at THREAD_STOP)
        for b in queue:
            if please_stop:
                return
            try:
                url = b.url + "json-info?node=" + revision
                rev = self.get_revision(Revision(branch=b, changeset={"id": revision}))
                with locker:
                    output.append(rev)
                Log.note("Revision found at {{url}}", url=url)
            except Exception as f:
                # RECORD THE FAILURE, BUT KEEP SCANNING THE OTHER BRANCHES
                problems.append(f)

    threads = []
    for i in range(NUM_WORKERS):
        threads.append(Thread.run("find changeset " + text_type(i), _find, please_stop=please_stop))
    for t in threads:
        with assert_no_exception:
            t.join()

    return output
def update_local_database(config, deviant_summary, candidates, since):
    """
    Bring the local copy of the deviant summary up to date for the given
    candidate series: first series we have never seen, then the stalest,
    capped by config.display.download_limit.

    :param config: application config; reads config.display.download_limit and config.database
    :param deviant_summary: local summary container; refuses to run when it is a bigquery.Table
    :param candidates: records with a signature_hash property naming the series of interest
    :param since: passed through to process() — presumably a lower time bound; verify against process()
    """
    if isinstance(deviant_summary, bigquery.Table):
        # ONLY THE LOCAL DATABASE IS UPDATED HERE; BIGQUERY IS FILLED BY ETL
        Log.note("Only the ETL process should fill the bigquery table")
        return

    # GET EVERYTHING WE HAVE SO FAR
    exists = deviant_summary.query({
        "select": ["signature_hash", "last_updated"],
        "where": {
            "and": [
                {"in": {"signature_hash": candidates.signature_hash}},
                {"exists": "num_pushes"},
            ]
        },
        "sort": "last_updated",
        "limit": 100000,
        "format": "list",
    }).data

    # CHOOSE MISSING, THEN OLDEST, UP TO "RECENT"
    missing = list(set(candidates.signature_hash) - set(exists.signature_hash))
    too_old = Date.today() - parse(LOCAL_RETENTION)
    # missing series first; then any existing series not refreshed within LOCAL_RETENTION
    needs_update = missing + [
        e.signature_hash for e in exists if e.last_updated < too_old.unix
    ]
    Log.alert("{{num}} series are candidates for local update", num=len(needs_update))

    limited_update = Queue("sigs")
    # CAP THE DOWNLOAD VOLUME (default 100 series per run)
    limited_update.extend(
        left(needs_update, coalesce(config.display.download_limit, 100)))
    Log.alert("Updating local database with {{num}} series", num=len(limited_update))

    with Timer("Updating local database"):
        def loop(please_stop):
            # WORKER: drain the queue until empty or asked to stop
            while not please_stop:
                signature_hash = limited_update.pop_one()
                if not signature_hash:
                    return
                process(
                    signature_hash,
                    since,
                    source=config.database,
                    deviant_summary=deviant_summary,
                )

        # THREE CONCURRENT WORKERS SHARING ONE QUEUE
        threads = [Thread.run(text(i), loop) for i in range(3)]
        for t in threads:
            t.join()

    Log.note("Local database is up to date")
def update_local_database():
    """
    Bring the local summary table up to date for the candidate series:
    first ids we have never seen, then the stalest, capped by
    config.analysis.download_limit.

    NOTE(review): reads module-level `summary_table`, `candidates`, `config`
    and calls module-level `process()` — confirm these globals are set before
    this function runs.
    """
    # GET EVERYTHING WE HAVE SO FAR
    exists = summary_table.query({
        "select": ["id", "last_updated"],
        "where": {
            "and": [{"in": {"id": candidates.id}}, {"exists": "num_pushes"}]
        },
        "sort": "last_updated",
        "limit": 100000,
        "format": "list",
    }).data

    # CHOOSE MISSING, THEN OLDEST, UP TO "RECENT"
    missing = list(set(candidates.id) - set(exists.id))
    too_old = Date.today() - parse(LOCAL_RETENTION)
    # BUG FIX: collect e.id, not the whole record `e`, so needs_update is a
    # homogeneous list of ids (the sibling update_local_database() with the
    # deviant_summary parameter already does this with e.signature_hash);
    # otherwise process() is called with a record where it expects an id
    needs_update = missing + [
        e.id for e in exists if e.last_updated < too_old.unix
    ]
    Log.alert("{{num}} series are candidates for local update", num=len(needs_update))

    limited_update = Queue("sigs")
    # CAP THE DOWNLOAD VOLUME (default 100 series per run)
    limited_update.extend(
        left(needs_update, coalesce(config.analysis.download_limit, 100)))
    Log.alert("Updating local database with {{num}} series", num=len(limited_update))

    with Timer("Updating local database"):
        def loop(please_stop):
            # WORKER: drain the queue until empty or asked to stop
            while not please_stop:
                sig_id = limited_update.pop_one()
                if not sig_id:
                    return
                process(sig_id)

        # THREE CONCURRENT WORKERS SHARING ONE QUEUE
        threads = [Thread.run(text(i), loop) for i in range(3)]
        for t in threads:
            t.join()

    Log.note("Local database is up to date")
class FromESMetadata(Schema):
    """
    QUERY THE METADATA

    Singleton that caches table/column metadata pulled from an Elasticsearch
    cluster, and keeps cardinality/partition statistics fresh via a
    background worker thread.
    """

    def __new__(cls, *args, **kwargs):
        # SINGLETON: reuse the one shared instance stored on jx_base_meta
        if jx_base_meta.singlton:
            return jx_base_meta.singlton
        else:
            jx_base_meta.singlton = object.__new__(cls)
            return jx_base_meta.singlton

    @override
    def __init__(self, host, index, sql_file='metadata.sqlite', alias=None, name=None, port=9200, kwargs=None):
        # SINGLETON: if __init__ already ran ("settings" set), do nothing
        if hasattr(self, "settings"):
            return
        self.too_old = TOO_OLD
        self.settings = kwargs
        self.default_name = coalesce(name, alias, index)
        self.default_es = elasticsearch.Cluster(kwargs=kwargs)
        self.index_does_not_exist = set()   # es_index names confirmed missing
        self.todo = Queue("refresh metadata", max=100000, unique=True)  # columns awaiting a stats refresh
        self.es_metadata = Null
        self.abs_columns = set()
        self.last_es_metadata = Date.now() - OLD_METADATA
        self.meta = Data()
        table_columns = metadata_tables()
        column_columns = metadata_columns()
        self.meta.tables = ListContainer("meta.tables", [], wrap({c.names["."]: c for c in table_columns}))
        self.meta.columns = ColumnList()
        self.meta.columns.insert(column_columns)
        self.meta.columns.insert(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("refresh metadata", self.not_monitor)
        return

    @property
    def query_path(self):
        return None

    @property
    def url(self):
        return self.default_es.path + "/" + self.default_name.replace(".", "/")

    def get_table(self, table_name):
        # RETURN ALL KNOWN TABLE RECORDS WITH THE GIVEN NAME (wrapped list)
        with self.meta.tables.locker:
            return wrap([t for t in self.meta.tables.data if t.name == table_name])

    def _upsert_column(self, c):
        # ASSUMING THE self.meta.columns.locker IS HAD
        # MERGE c INTO THE MATCHING CANONICAL COLUMN, OR ADD c AS NEW
        existing_columns = self.meta.columns.find(c.es_index, c.names["."])
        for canonical in existing_columns:
            if canonical.type == c.type and canonical is not c:
                set_default(c.names, canonical.names)
                for key in Column.__slots__:
                    canonical[key] = c[key]
                if DEBUG:
                    Log.note("todo: {{table}}::{{column}}", table=canonical.es_index, column=canonical.es_column)
                self.todo.add(canonical)
                break
        else:
            # NO MATCHING CANONICAL COLUMN FOUND (for/else): add as new
            self.meta.columns.add(c)
            self.todo.add(c)
            if ENABLE_META_SCAN:
                if DEBUG:
                    Log.note("todo: {{table}}::{{column}}", table=c.es_index, column=c.es_column)
                # MARK meta.columns AS DIRTY TOO
                cols = self.meta.columns.find("meta.columns", None)
                for cc in cols:
                    cc.partitions = cc.cardinality = None
                    cc.last_updated = Date.now() - TOO_OLD
                self.todo.extend(cols)

    def _get_columns(self, table=None):
        # TODO: HANDLE MORE THEN ONE ES, MAP TABLE SHORT_NAME TO ES INSTANCE
        # REFRESH COLUMN METADATA FOR ALL MAPPINGS OF THE GIVEN INDEX
        table_path = split_field(table)
        es_index = table_path[0]
        meta = self.es_metadata.indices[es_index]
        if not meta or self.last_es_metadata < Date.now() - OLD_METADATA:
            # STALE OR MISSING: force a metadata pull from the cluster
            self.es_metadata = self.default_es.get_metadata(force=True)
            meta = self.es_metadata.indices[es_index]
        for data_type, properties in meta.mappings.items():
            if data_type == "_default_":
                continue
            # ENSURE _id IS VISIBLE AS A REGULAR (not_analyzed) COLUMN
            properties.properties["_id"] = {"type": "string", "index": "not_analyzed"}
            self._parse_properties(meta.index, properties, meta)

    def _parse_properties(self, abs_index, properties, meta):
        # IT IS IMPORTANT THAT NESTED PROPERTIES NAME ALL COLUMNS, AND
        # ALL COLUMNS ARE GIVEN NAMES FOR ALL NESTED PROPERTIES
        def add_column(c, query_path):
            # upsert column c under the given query path, for every alias of the index
            c.last_updated = Date.now() - TOO_OLD
            if query_path[0] != ".":
                c.names[query_path[0]] = relative_field(c.names["."], query_path[0])
            with self.meta.columns.locker:
                for alias in meta.aliases:
                    c_ = copy(c)
                    c_.es_index = alias
                    self._upsert_column(c_)
                self._upsert_column(c)

        abs_columns = elasticsearch.parse_properties(abs_index, None, properties.properties)
        self.abs_columns.update(abs_columns)
        with Timer("upserting {{num}} columns", {"num": len(abs_columns)}, debug=DEBUG):
            # LIST OF EVERY NESTED PATH
            query_paths = [[c.es_column] for c in abs_columns if c.type == "nested"]
            # INSERT EACH PATH'S PREFIX PATHS INTO ITS LIST, KEEPING ORDER BY LENGTH
            for a, b in itertools.product(query_paths, query_paths):
                aa = a[0]
                bb = b[0]
                if aa and bb.startswith(aa):
                    for i, b_prefix in enumerate(b):
                        if len(b_prefix) > len(aa):
                            continue
                        if aa == b_prefix:
                            break  # SPLIT ALREADY FOUND
                        b.insert(i, aa)
                        break
            for q in query_paths:
                q.append(".")
            query_paths.append(SELF_PATH)
            # ADD RELATIVE COLUMNS
            for abs_column in abs_columns:
                abs_column = abs_column.__copy__()
                abs_column.type = es_type_to_json_type[abs_column.type]
                for query_path in query_paths:
                    add_column(abs_column, query_path)
        pass

    def query(self, _query):
        # RUN _query AGAINST THE COLUMN METADATA (default sort by table, name)
        return self.meta.columns.query(
            QueryOp(
                set_default(
                    {"from": self.meta.columns, "sort": ["table", "name"]},
                    _query.__data__())))

    def get_columns(self, table_name, column_name=None, force=False):
        """
        RETURN METADATA COLUMNS

        :param table_name: dotted table path; first segment is the es index
        :param column_name: optional single column to return
        :param force: refresh column metadata even if not stale
        """
        table_path = split_field(table_name)
        es_index_name = table_path[0]
        query_path = join_field(table_path[1:])
        table = self.get_table(es_index_name)[0]
        abs_column_name = None if column_name == None else concat_field(query_path, column_name)

        try:
            # LAST TIME WE GOT INFO FOR THIS TABLE
            if not table:
                # NEVER SEEN: register the table and pull its columns
                table = Table(name=es_index_name, url=None, query_path=['.'], timestamp=Date.now())
                with self.meta.tables.locker:
                    self.meta.tables.add(table)
                self._get_columns(table=es_index_name)
            elif force or table.timestamp == None or table.timestamp < Date.now() - MAX_COLUMN_METADATA_AGE:
                # STALE (or forced): refresh
                table.timestamp = Date.now()
                self._get_columns(table=es_index_name)

            with self.meta.columns.locker:
                columns = self.meta.columns.find(es_index_name, column_name)
            if columns:
                columns = jx.sort(columns, "names.\.")
                # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
                while len(self.todo) and not all(columns.get("last_updated")):
                    if DEBUG:
                        Log.note(
                            "waiting for columns to update {{columns|json}}",
                            columns=[
                                c.es_index + "." + c.es_column
                                for c in columns
                                if not c.last_updated
                            ])
                    Till(seconds=1).wait()
                return columns
        except Exception as e:
            Log.error("Not expected", cause=e)

        # NOTHING FOUND: raise with as much context as we have
        if abs_column_name:
            Log.error("no columns matching {{table}}.{{column}}", table=table_name, column=abs_column_name)
        else:
            self._get_columns(table=table_name)  # TO TEST WHAT HAPPENED
            Log.error("no columns for {{table}}?!", table=table_name)

    def _update_cardinality(self, column):
        """
        QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN
        """
        if column.es_index in self.index_does_not_exist:
            return

        if column.type in STRUCT:
            Log.error("not supported")
        try:
            # SPECIAL CASE: our own metadata "tables" are counted locally, not via ES
            if column.es_index == "meta.columns":
                with self.meta.columns.locker:
                    partitions = jx.sort([
                        g[column.es_column]
                        for g, _ in jx.groupby(self.meta.columns, column.es_column)
                        if g[column.es_column] != None
                    ])
                    self.meta.columns.update({
                        "set": {
                            "partitions": partitions,
                            "count": len(self.meta.columns),
                            "cardinality": len(partitions),
                            "multi": 1,
                            "last_updated": Date.now()
                        },
                        "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                    })
                return
            if column.es_index == "meta.tables":
                with self.meta.columns.locker:
                    partitions = jx.sort([
                        g[column.es_column]
                        for g, _ in jx.groupby(self.meta.tables, column.es_column)
                        if g[column.es_column] != None
                    ])
                    self.meta.columns.update({
                        "set": {
                            "partitions": partitions,
                            "count": len(self.meta.tables),
                            "cardinality": len(partitions),
                            "multi": 1,
                            "last_updated": Date.now()
                        },
                        "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                    })
                return

            es_index = column.es_index.split(".")[0]

            is_text = [cc for cc in self.abs_columns if cc.es_column == column.es_column and cc.type == "text"]
            if is_text:
                # text IS A MULTIVALUE STRING THAT CAN ONLY BE FILTERED
                # (1001 is a sentinel meaning "too many to enumerate")
                result = self.default_es.post("/" + es_index + "/_search", data={
                    "aggs": {"count": {"filter": {"match_all": {}}}},
                    "size": 0
                })
                count = result.hits.total
                cardinality = 1001
                multi = 1001
            elif column.es_column == "_id":
                # _id IS UNIQUE: cardinality equals document count
                result = self.default_es.post("/" + es_index + "/_search", data={
                    "query": {"match_all": {}},
                    "size": 0
                })
                count = cardinality = result.hits.total
                multi = 1
            else:
                # GENERAL CASE: count distinct values, and max values-per-doc ("multi")
                result = self.default_es.post("/" + es_index + "/_search", data={
                    "aggs": {
                        "count": _counting_query(column),
                        "multi": {"max": {"script": "doc[" + quote(column.es_column) + "].values.size()"}}
                    },
                    "size": 0
                })
                r = result.aggregations.count
                count = result.hits.total
                cardinality = coalesce(r.value, r._nested.value, r.doc_count)
                multi = coalesce(r.multi.value, 1)

            if cardinality == None:
                Log.error("logic error")

            query = Data(size=0)

            if column.es_column == "_id":
                with self.meta.columns.locker:
                    self.meta.columns.update({
                        "set": {
                            "count": cardinality,
                            "cardinality": cardinality,
                            "multi": 1,
                            "last_updated": Date.now()
                        },
                        "clear": ["partitions"],
                        "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                    })
                return
            elif cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99):
                # TOO MANY DISTINCT VALUES (or near-unique): do not record partitions
                if DEBUG:
                    Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
                with self.meta.columns.locker:
                    self.meta.columns.update({
                        "set": {
                            "count": count,
                            "cardinality": cardinality,
                            "multi": multi,
                            "last_updated": Date.now()
                        },
                        "clear": ["partitions"],
                        "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                    })
                return
            elif column.type in elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
                # NUMERIC WITH MANY VALUES: partitions not useful
                if DEBUG:
                    Log.note("{{field}} has {{num}} parts", field=column.es_index, num=cardinality)
                with self.meta.columns.locker:
                    self.meta.columns.update({
                        "set": {
                            "count": count,
                            "cardinality": cardinality,
                            "multi": multi,
                            "last_updated": Date.now()
                        },
                        "clear": ["partitions"],
                        "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                    })
                return
            elif len(column.nested_path) != 1:
                # NESTED COLUMN: terms agg must run inside a nested agg
                query.aggs["_"] = {
                    "nested": {"path": column.nested_path[0]},
                    "aggs": {"_nested": {"terms": {"field": column.es_column}}}
                }
            elif cardinality == 0:
                query.aggs["_"] = {"terms": {"field": column.es_column}}
            else:
                query.aggs["_"] = {"terms": {"field": column.es_column, "size": cardinality}}

            # FETCH THE PARTITIONS (term buckets)
            result = self.default_es.post("/" + es_index + "/_search", data=query)
            aggs = result.aggregations._
            if aggs._nested:
                parts = jx.sort(aggs._nested.buckets.key)
            else:
                parts = jx.sort(aggs.buckets.key)
            if DEBUG:
                Log.note("{{field}} has {{parts}}", field=column.names["."], parts=parts)

            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "partitions": parts,
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
        except Exception as e:
            # CAN NOT IMPORT: THE TEST MODULES SETS UP LOGGING
            # from tests.test_jx import TEST_TABLE
            TEST_TABLE = "testdata"
            is_missing_index = any(w in e for w in ["IndexMissingException", "index_not_found_exception"])
            is_test_table = any(column.es_index.startswith(t) for t in [TEST_TABLE_PREFIX, TEST_TABLE])
            if is_missing_index and is_test_table:
                # WE EXPECT TEST TABLES TO DISAPPEAR
                with self.meta.columns.locker:
                    self.meta.columns.update({
                        "clear": ".",
                        "where": {"eq": {"es_index": column.es_index}}
                    })
                self.index_does_not_exist.add(column.es_index)
            else:
                # UNKNOWN FAILURE: clear the stats so they are not trusted, and warn
                self.meta.columns.update({
                    "set": {"last_updated": Date.now()},
                    "clear": ["count", "cardinality", "multi", "partitions",],
                    "where": {"eq": {"names.\\.": ".", "es_index": column.es_index, "es_column": column.es_column}}
                })
                Log.warning("Could not get {{col.es_index}}.{{col.es_column}} info", col=column, cause=e)

    def monitor(self, please_stop):
        # BACKGROUND WORKER: drain self.todo, refreshing column cardinality
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            try:
                if not self.todo:
                    # QUEUE EMPTY: look for columns whose stats have gone stale
                    with self.meta.columns.locker:
                        old_columns = [
                            c for c in self.meta.columns
                            if (c.last_updated == None or c.last_updated < Date.now() - TOO_OLD) and c.type not in STRUCT
                        ]
                        if old_columns:
                            if DEBUG:
                                Log.note(
                                    "Old columns {{names|json}} last updated {{dates|json}}",
                                    names=wrap(old_columns).es_column,
                                    dates=[Date(t).format() for t in wrap(old_columns).last_updated])
                            self.todo.extend(old_columns)
                            # TEST CONSISTENCY
                            for c, d in product(list(self.todo.queue), list(self.todo.queue)):
                                if c.es_column == d.es_column and c.es_index == d.es_index and c != d:
                                    Log.error("")
                        else:
                            if DEBUG:
                                Log.note("no more metatdata to update")

                column = self.todo.pop(Till(seconds=(10 * MINUTE).seconds))
                if DEBUG:
                    Log.note("update {{table}}.{{column}}", table=column.es_index, column=column.es_column)
                if column:
                    if column.es_index in self.index_does_not_exist:
                        # INDEX GONE: purge its column records
                        with self.meta.columns.locker:
                            self.meta.columns.update({
                                "clear": ".",
                                "where": {"eq": {"es_index": column.es_index}}
                            })
                        continue
                    if column.type in STRUCT or column.es_column.endswith("." + EXISTS_TYPE):
                        # STRUCT/EXISTS COLUMNS HAVE NO CARDINALITY TO MEASURE
                        with self.meta.columns.locker:
                            column.last_updated = Date.now()
                        continue
                    elif column.last_updated >= Date.now() - TOO_OLD:
                        continue
                    try:
                        self._update_cardinality(column)
                        if DEBUG and not column.es_index.startswith(TEST_TABLE_PREFIX):
                            Log.note("updated {{column.name}}", column=column)
                    except Exception as e:
                        Log.warning("problem getting cardinality for {{column.name}}", column=column, cause=e)
            except Exception as e:
                Log.warning("problem in cardinality monitor", cause=e)

    def not_monitor(self, please_stop):
        # FALLBACK WORKER (META SCAN DISABLED): just clear stale stats, no ES queries
        Log.alert("metadata scan has been disabled")
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            c = self.todo.pop()
            if c == THREAD_STOP:
                break
            if not c.last_updated or c.last_updated >= Date.now() - TOO_OLD:
                continue
            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {"last_updated": Date.now()},
                    "clear": ["count", "cardinality", "multi", "partitions",],
                    "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                })
            if DEBUG:
                Log.note("Could not get {{col.es_index}}.{{col.es_column}} info", col=c)
class ElasticsearchMetadata(Namespace):
    """
    MANAGE SNOWFLAKE SCHEMAS FOR EACH OF THE ALIASES FOUND IN THE CLUSTER

    Singleton that maps index names to aliases, caches column metadata, and
    keeps cardinality/partition statistics fresh via a background worker.
    """

    def __new__(cls, *args, **kwargs):
        # SINGLETON: reuse the one shared instance stored on jx_base_meta
        if jx_base_meta.singlton:
            return jx_base_meta.singlton
        else:
            jx_base_meta.singlton = object.__new__(cls)
            return jx_base_meta.singlton

    @override
    def __init__(self, host, index, sql_file='metadata.sqlite', alias=None, name=None, port=9200, kwargs=None):
        # SINGLETON: if __init__ already ran ("settings" set), do nothing
        if hasattr(self, "settings"):
            return
        self.too_old = TOO_OLD
        self.settings = kwargs
        self.default_name = coalesce(name, alias, index)
        self.es_cluster = elasticsearch.Cluster(kwargs=kwargs)
        self.index_does_not_exist = set()   # es_index names confirmed missing
        self.todo = Queue("refresh metadata", max=100000, unique=True)  # columns awaiting a stats refresh
        self.index_to_alias = Relation_usingList()
        self.es_metadata = Null
        self.metadata_last_updated = Date.now() - OLD_METADATA
        self.meta = Data()
        self.meta.columns = ColumnList()
        self.alias_to_query_paths = {"meta.columns": [['.']], "meta.tables": [['.']]}
        self.alias_last_updated = {"meta.columns": Date.now(), "meta.tables": Date.now()}
        table_columns = metadata_tables()
        self.meta.tables = ListContainer("meta.tables", [], jx_base.Schema(".", table_columns))
        self.meta.columns.extend(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("refresh metadata", self.not_monitor)
        return

    @property
    def url(self):
        return self.es_cluster.path + "/" + self.default_name.replace(".", "/")

    def _reload_columns(self, table_desc):
        """
        :param alias: A REAL ALIAS (OR NAME OF INDEX THAT HAS NO ALIAS)
        :return:
        """
        # FIND ALL INDEXES OF ALIAS
        es_last_updated = self.es_cluster.metatdata_last_updated
        alias = table_desc.name
        canonical_index = self.es_cluster.get_best_matching_index(alias).index
        update_required = not (table_desc.timestamp < es_last_updated)
        metadata = self.es_cluster.get_metadata(force=update_required)

        indexes = self.index_to_alias.get_domain(alias)
        # (index handle, best mapping type, its properties) for each index of the alias
        props = [
            (self.es_cluster.get_index(index=i, type=t, debug=DEBUG), t, m.properties)
            for i, d in metadata.indices.items()
            if i in indexes
            for t, m in [_get_best_type_from_mapping(d.mappings)]
        ]

        # CONFIRM ALL COLUMNS ARE SAME, FIX IF NOT
        dirty = False
        all_comparisions = list(jx.pairwise(props)) + list(jx.pairwise(jx.reverse(props)))
        # NOTICE THE SAME (index, type, properties) TRIPLE FROM ABOVE
        for (i1, t1, p1), (i2, t2, p2) in all_comparisions:
            diff = elasticsearch.diff_schema(p2, p1)
            if not self.settings.read_only:
                for d in diff:
                    dirty = True
                    i1.add_property(*d)
        # IF ANY SCHEMAS WERE PATCHED, PULL FRESH METADATA
        meta = self.es_cluster.get_metadata(force=dirty).indices[canonical_index]

        data_type, mapping = _get_best_type_from_mapping(meta.mappings)
        # ENSURE _id IS VISIBLE AS A REGULAR (not_analyzed) COLUMN
        mapping.properties["_id"] = {"type": "string", "index": "not_analyzed"}
        self._parse_properties(alias, mapping, meta)
        table_desc.timestamp = es_last_updated

    def _parse_properties(self, alias, mapping, meta):
        # REGISTER EVERY COLUMN OF mapping UNDER alias, WITH RELATIVE NAMES
        # FOR EVERY NESTED QUERY PATH
        abs_columns = elasticsearch.parse_properties(alias, None, mapping.properties)
        with Timer("upserting {{num}} columns", {"num": len(abs_columns)}, debug=DEBUG):
            # LIST OF EVERY NESTED PATH
            query_paths = [[c.es_column] for c in abs_columns if c.es_type == "nested"]
            # INSERT EACH PATH'S PREFIX PATHS INTO ITS LIST, KEEPING ORDER BY LENGTH
            for a, b in itertools.product(query_paths, query_paths):
                aa = a[0]
                bb = b[0]
                if aa and bb.startswith(aa):
                    for i, b_prefix in enumerate(b):
                        if len(b_prefix) > len(aa):
                            continue
                        if aa == b_prefix:
                            break  # SPLIT ALREADY FOUND
                        b.insert(i, aa)
                        break
            for q in query_paths:
                q.append(SELF_PATH)
            query_paths.append(ROOT_PATH)
            self.alias_to_query_paths[alias] = query_paths

            # ADD RELATIVE NAMES
            for abs_column in abs_columns:
                abs_column.last_updated = None
                abs_column.jx_type = es_type_to_json_type[abs_column.es_type]
                for query_path in query_paths:
                    abs_column.names[query_path[0]] = relative_field(abs_column.names["."], query_path[0])
                self.todo.add(self.meta.columns.add(abs_column))
        pass

    def query(self, _query):
        # RUN _query AGAINST THE COLUMN METADATA (default sort by table, name)
        return self.meta.columns.query(
            QueryOp(
                set_default(
                    {"from": self.meta.columns, "sort": ["table", "name"]},
                    _query.__data__())))

    def _find_alias(self, name):
        # RESOLVE name TO AN ALIAS, refreshing the index->alias map when stale
        if self.metadata_last_updated < self.es_cluster.metatdata_last_updated:
            for a in self.es_cluster.get_aliases():
                self.index_to_alias[a.index] = coalesce(a.alias, a.index)
                self.alias_last_updated.setdefault(a.alias, Date.MIN)
        if name in self.alias_last_updated:
            return name
        else:
            return self.index_to_alias[name]

    def get_columns(self, table_name, column_name=None, force=False):
        """
        RETURN METADATA COLUMNS

        :param table_name: dotted table path; first segment resolves to an alias
        :param column_name: optional single column to return
        :param force: refresh column metadata even if not stale
        """
        table_path = split_field(table_name)
        root_table_name = table_path[0]

        alias = self._find_alias(root_table_name)
        if not alias:
            # UNKNOWN NAME: force a cluster metadata pull and retry once
            self.es_cluster.get_metadata(force=True)
            alias = self._find_alias(root_table_name)
            if not alias:
                Log.error("{{table|quote}} does not exist", table=table_name)

        try:
            last_update = MAX([
                self.es_cluster.index_last_updated[i]
                for i in self.index_to_alias.get_domain(alias)
            ])

            table = self.get_table(alias)[0]
            # LAST TIME WE GOT INFO FOR THIS TABLE
            if not table:
                # NEVER SEEN: register the table and pull its columns
                table = TableDesc(name=alias, url=None, query_path=['.'], timestamp=Date.MIN)
                with self.meta.tables.locker:
                    self.meta.tables.add(table)
                self._reload_columns(table)
            elif force or table.timestamp < last_update:
                self._reload_columns(table)

            columns = self.meta.columns.find(alias, column_name)
            columns = jx.sort(columns, "names.\\.")
            # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
            while len(self.todo) and not all(columns.get("last_updated")):
                if DEBUG:
                    if len(columns) > 10:
                        Log.note("waiting for {{num}} columns to update", num=len([c for c in columns if not c.last_updated]))
                    else:
                        Log.note(
                            "waiting for columns to update {{columns|json}}",
                            columns=[
                                c.es_index + "." + c.es_column
                                for c in columns
                                if not c.last_updated
                            ])
                Till(seconds=1).wait()
            return columns
        except Exception as e:
            Log.error("Not expected", cause=e)

        return []

    def _update_cardinality(self, column):
        """
        QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN
        """
        if column.es_index in self.index_does_not_exist:
            return

        if column.jx_type in STRUCT:
            Log.error("not supported")
        try:
            # SPECIAL CASE: our own metadata "tables" are counted locally, not via ES
            if column.es_index == "meta.columns":
                partitions = jx.sort([
                    g[column.es_column]
                    for g, _ in jx.groupby(self.meta.columns, column.es_column)
                    if g[column.es_column] != None
                ])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.columns),
                        "cardinality": len(partitions),
                        "multi": 1,
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            if column.es_index == "meta.tables":
                partitions = jx.sort([
                    g[column.es_column]
                    for g, _ in jx.groupby(self.meta.tables, column.es_column)
                    if g[column.es_column] != None
                ])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.tables),
                        "cardinality": len(partitions),
                        "multi": 1,
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return

            es_index = column.es_index.split(".")[0]

            is_text = [cc for cc in self.meta.columns if cc.es_column == column.es_column and cc.es_type == "text"]
            if is_text:
                # text IS A MULTIVALUE STRING THAT CAN ONLY BE FILTERED
                # (1001 is a sentinel meaning "too many to enumerate")
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "aggs": {"count": {"filter": {"match_all": {}}}},
                    "size": 0
                })
                count = result.hits.total
                cardinality = 1001
                multi = 1001
            elif column.es_column == "_id":
                # _id IS UNIQUE: cardinality equals document count
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "query": {"match_all": {}},
                    "size": 0
                })
                count = cardinality = result.hits.total
                multi = 1
            elif column.es_type == BOOLEAN:
                # BOOLEANS HAVE KNOWN CARDINALITY
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "aggs": {"count": _counting_query(column)},
                    "size": 0
                })
                count = result.hits.total
                cardinality = 2
                multi = 1
            else:
                # GENERAL CASE: count distinct values, and max values-per-doc ("multi")
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "aggs": {
                        "count": _counting_query(column),
                        "multi": {"max": {"script": "doc[" + quote(column.es_column) + "].values.size()"}}
                    },
                    "size": 0
                })
                agg_results = result.aggregations
                count = result.hits.total
                cardinality = coalesce(agg_results.count.value, agg_results.count._nested.value, agg_results.count.doc_count)
                multi = int(coalesce(agg_results.multi.value, 1))

            if cardinality == None:
                Log.error("logic error")

            query = Data(size=0)

            if column.es_column == "_id":
                self.meta.columns.update({
                    "set": {
                        "count": cardinality,
                        "cardinality": cardinality,
                        "multi": 1,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            elif cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99):
                # TOO MANY DISTINCT VALUES (or near-unique): do not record partitions
                DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            elif column.es_type in elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
                # NUMERIC WITH MANY VALUES: partitions not useful
                DEBUG and Log.note("{{field}} has {{num}} parts", field=column.es_index, num=cardinality)
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            elif len(column.nested_path) != 1:
                # NESTED COLUMN: terms agg must run inside a nested agg
                query.aggs["_"] = {
                    "nested": {"path": column.nested_path[0]},
                    "aggs": {"_nested": {"terms": {"field": column.es_column}}}
                }
            elif cardinality == 0:
                query.aggs["_"] = {"terms": {"field": column.es_column}}
            else:
                query.aggs["_"] = {"terms": {"field": column.es_column, "size": cardinality}}

            # FETCH THE PARTITIONS (term buckets)
            result = self.es_cluster.post("/" + es_index + "/_search", data=query)
            aggs = result.aggregations._
            if aggs._nested:
                parts = jx.sort(aggs._nested.buckets.key)
            else:
                parts = jx.sort(aggs.buckets.key)

            self.meta.columns.update({
                "set": {
                    "count": count,
                    "cardinality": cardinality,
                    "multi": multi,
                    "partitions": parts,
                    "last_updated": Date.now()
                },
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
        except Exception as e:
            # CAN NOT IMPORT: THE TEST MODULES SETS UP LOGGING
            # from tests.test_jx import TEST_TABLE
            TEST_TABLE = "testdata"
            is_missing_index = any(w in e for w in ["IndexMissingException", "index_not_found_exception"])
            is_test_table = any(column.es_index.startswith(t) for t in [TEST_TABLE_PREFIX, TEST_TABLE])
            if is_missing_index and is_test_table:
                # WE EXPECT TEST TABLES TO DISAPPEAR
                self.meta.columns.update({
                    "clear": ".",
                    "where": {"eq": {"es_index": column.es_index}}
                })
                self.index_does_not_exist.add(column.es_index)
            else:
                # UNKNOWN FAILURE: clear the stats so they are not trusted, and warn
                self.meta.columns.update({
                    "set": {"last_updated": Date.now()},
                    "clear": ["count", "cardinality", "multi", "partitions",],
                    "where": {"eq": {"names.\\.": ".", "es_index": column.es_index, "es_column": column.es_column}}
                })
                Log.warning("Could not get {{col.es_index}}.{{col.es_column}} info", col=column, cause=e)

    def monitor(self, please_stop):
        # BACKGROUND WORKER: drain self.todo, refreshing column cardinality
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            try:
                if not self.todo:
                    # QUEUE EMPTY: look for columns whose stats have gone stale
                    old_columns = [
                        c for c in self.meta.columns
                        if (c.last_updated == None or c.last_updated < Date.now() - TOO_OLD) and c.jx_type not in STRUCT
                    ]
                    if old_columns:
                        DEBUG and Log.note(
                            "Old columns {{names|json}} last updated {{dates|json}}",
                            names=wrap(old_columns).es_column,
                            dates=[Date(t).format() for t in wrap(old_columns).last_updated])
                        self.todo.extend(old_columns)
                        # TEST CONSISTENCY
                        for c, d in product(list(self.todo.queue), list(self.todo.queue)):
                            if c.es_column == d.es_column and c.es_index == d.es_index and c != d:
                                Log.error("")
                    else:
                        DEBUG and Log.note("no more metatdata to update")

                column = self.todo.pop(Till(seconds=(10 * MINUTE).seconds))
                if column:
                    if column is THREAD_STOP:
                        continue
                    DEBUG and Log.note("update {{table}}.{{column}}", table=column.es_index, column=column.es_column)
                    if column.es_index in self.index_does_not_exist:
                        # INDEX GONE: purge its column records
                        self.meta.columns.update({
                            "clear": ".",
                            "where": {"eq": {"es_index": column.es_index}}
                        })
                        continue
                    if column.jx_type in STRUCT or column.es_column.endswith("." + EXISTS_TYPE):
                        # STRUCT/EXISTS COLUMNS HAVE NO CARDINALITY TO MEASURE
                        column.last_updated = Date.now()
                        continue
                    elif column.last_updated >= Date.now() - TOO_OLD:
                        continue
                    try:
                        self._update_cardinality(column)
                        (DEBUG and not column.es_index.startswith(TEST_TABLE_PREFIX)) and Log.note("updated {{column.name}}", column=column)
                    except Exception as e:
                        Log.warning("problem getting cardinality for {{column.name}}", column=column, cause=e)
            except Exception as e:
                Log.warning("problem in cardinality monitor", cause=e)

    def not_monitor(self, please_stop):
        # FALLBACK WORKER (META SCAN DISABLED): just clear stale stats, no ES queries
        Log.alert("metadata scan has been disabled")
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            c = self.todo.pop()
            if c == THREAD_STOP:
                break
            if c.last_updated >= Date.now() - TOO_OLD:
                continue
            self.meta.columns.update({
                "set": {"last_updated": Date.now()},
                "clear": ["count", "cardinality", "multi", "partitions",],
                "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
            })
            DEBUG and Log.note("Did not get {{col.es_index}}.{{col.es_column}} info", col=c)

    def get_table(self, alias_name):
        # RETURN ALL KNOWN TABLE RECORDS WITH THE GIVEN NAME (wrapped list)
        with self.meta.tables.locker:
            return wrap([t for t in self.meta.tables.data if t.name == alias_name])

    def get_snowflake(self, fact_table_name):
        return Snowflake(fact_table_name, self)

    def get_schema(self, name):
        # meta.columns HAS ITS OWN SCHEMA; everything else goes through the snowflake
        if name == "meta.columns":
            return self.meta.columns.schema
        query_path = split_field(name)
        root, rest = query_path[0], join_field(query_path[1:])
        return self.get_snowflake(root).get_schema(rest)
class Table(BaseFacts):
    """
    Facade over a BigQuery table that may be "sharded": a view with the
    table's name that points at a primary shard, plus zero or more extra
    shards created whenever a batch of documents introduces new columns.
    NOTE(review): behavior inferred from the visible client calls
    (get_table/create_view/insert_rows_json); confirm against the container
    implementation.
    """

    @override
    def __init__(
        self,
        table,
        typed,
        read_only,
        sharded,
        container,
        id=Null,
        partition=Null,
        cluster=Null,
        top_level_fields=Null,
        kwargs=None,
    ):
        """
        :param table: short (unescaped) table name
        :param typed: passed through to config (semantics not visible here)
        :param read_only: when True, _extend() refuses to write
        :param sharded: when True, full_name must be a VIEW over a primary shard
        :param container: owner providing client, full_name, create_table/view
        :param id: identity spec; .field/.version are used by condense()
        :param partition / cluster / top_level_fields: schema options,
            forwarded to Snowflake.parse and replicated via self.config
        """
        self.short_name = table
        self.typed = typed
        self.read_only = read_only
        self.cluster = cluster
        self.id = id
        self.top_level_fields = top_level_fields
        # USED TO REPLICATE THIS (same settings reused when making new shards)
        self.config = Data(
            typed=typed,
            read_only=read_only,
            sharded=sharded,
            id=id,
            partition=partition,
            cluster=cluster,
            top_level_fields=top_level_fields,
        )
        esc_name = escape_name(table)
        self.full_name = container.full_name + esc_name
        self.alias_view = alias_view = container.client.get_table(text(self.full_name))
        self.partition = partition
        self.container = container

        if not sharded:
            # UNSHARDED: the name must resolve to a plain table (unless read-only)
            if not read_only and alias_view.table_type == "VIEW":
                Log.error("Expecting a table, not a view")
            self.shard = alias_view
            self._flake = Snowflake.parse(
                alias_view.schema,
                text(self.full_name),
                self.top_level_fields,
                partition,
            )
        else:
            # SHARDED: the name must be a view whose SQL names the primary shard
            if alias_view.table_type != "VIEW":
                Log.error("Sharded tables require a view")
            current_view = container.client.get_table(text(self.full_name))
            view_sql = current_view.view_query
            shard_name = _extract_primary_shard_name(view_sql)
            try:
                self.shard = container.client.get_table(
                    text(container.full_name + shard_name)
                )
                self._flake = Snowflake.parse(
                    alias_view.schema,
                    text(self.full_name),
                    self.top_level_fields,
                    partition,
                )
            except Exception as e:
                # PRIMARY SHARD IS MISSING/BROKEN: rebuild shard and view
                Log.warning("view {{name}} is invalid", name=shard_name, cause=e)
                self._flake = Snowflake.parse(
                    alias_view.schema,
                    text(self.full_name),
                    self.top_level_fields,
                    partition,
                )
                # REMOVE STALE VIEW
                container.client.delete_table(current_view)
                # MAKE NEW VIEW POINTING TO NEW SHARD
                self._create_new_shard()
                container.create_view(
                    self.full_name,
                    self.container.full_name + ApiName(self.shard.table_id),
                )
        self.last_extend = Date.now() - EXTEND_LIMIT
        self.extend_locker = Lock()
        self.extend_queue = Queue("wait for extend")

    def all_records(self):
        """
        MOSTLY FOR TESTING, RETURN ALL RECORDS IN TABLE
        :return: generator of documents (see sql_query)
        """
        return self.sql_query(sql_query({"from": text(self.full_name)}, self.schema))

    def jx_query(self, jx_query):
        """
        Run a JX-style query against this table; results are fully
        materialized, untyped, and returned in "list" format.
        """
        docs = self.sql_query(
            sql_query(
                dict_to_data({"from": text(self.full_name)}) | jx_query, self.schema
            )
        )
        data = []
        for d in docs:
            u = untyped(from_data(leaves_to_data(d)))
            data.append(u)
        return Data(data=data, format="list")

    @property
    def schema(self):
        # The Snowflake doubles as the table schema
        return self._flake

    def sql_query(self, sql):
        """
        :param sql: SQL QUERY
        :return: GENERATOR OF DOCUMENTS as dict
        """
        query_job = self.container.query_and_wait(sql)
        # WE WILL REACH INTO THE _flake, SINCE THIS IS THE FIRST PLACE WE ARE ACTUALLY PULLING RECORDS OUT
        # TODO: WITH MORE CODE THIS LOGIC GOES ELSEWHERE
        _ = self._flake.columns  # ENSURE schema HAS BEEN PROCESSED

        if not self._flake._top_level_fields.keys():
            # NO TOP-LEVEL FIELD REMAPPING REQUIRED
            for row in query_job:
                yield untyped(dict(row))
        else:
            # MAP FLATTENED ("top level") COLUMN NAMES BACK TO THEIR DEEP PATHS
            top2deep = {
                name: path
                for path, name in self._flake._top_level_fields.items()
            }
            for row in query_job:
                output = {}
                doc = dict(row)
                # COPY ALL BUT TOP LEVEL FIELDS
                for k, v in doc.items():
                    deep = top2deep.get(k)
                    if deep is None:
                        output[k] = v
                # INSERT TOP LEVEL FIELDS
                reach = wrap(output)
                for k, p in top2deep.items():
                    try:
                        reach[p] = doc.get(k)
                    except Exception as cause:
                        # NOTE(review): re-raising unchanged makes this
                        # try/except a no-op — likely leftover debug scaffold
                        raise cause
                yield untyped(output)

    @property
    def flake(self):
        return self._flake

    def _create_new_shard(self):
        # New shard gets a random suffix; config replicates this table's settings
        primary_shard = self.container.create_table(
            table=self.short_name + "_" + "".join(randoms.sample(ALLOWED, 20)),
            sharded=False,
            schema=self._flake.schema,
            kwargs=self.config,
        )
        self.shard = primary_shard.shard

    def extend(self, docs):
        """
        Queue docs and, under the lock, flush everything queued so far.
        Concurrent callers may find the queue already drained — that is fine.
        """
        self.extend_queue.extend(docs)
        with self.extend_locker:
            docs = self.extend_queue.pop_all()
            self._extend(docs)

    def _extend(self, rows):
        """
        Encode rows to the typed BigQuery form and insert them into the
        current shard; on schema growth a new shard is created, and on
        transport errors the batch is retried in smaller chunks.
        """
        if self.read_only:
            Log.error("not for writing")
        if len(rows) == 0:
            return

        try:
            update = {}
            with Timer("encoding", verbose=DEBUG):
                # Re-encode the whole batch until no new nested column appears
                while True:
                    typed_rows = []
                    for rownum, row in enumerate(rows):
                        typed_row, more, add_nested = typed_encode(row, self.flake)
                        set_default(update, more)
                        if add_nested:
                            # row HAS NEW NESTED COLUMN!
                            # GO OVER THE rows AGAIN SO "RECORD" GET MAPPED TO "REPEATED"
                            DEBUG and Log.note("New nested documnet found, retrying")
                            break
                        typed_rows.append(typed_row)
                    else:
                        break
            if update or not self.shard:
                # BATCH HAS ADDITIONAL COLUMNS!!
                # WE CAN NOT USE THE EXISTING SHARD, MAKE A NEW ONE:
                self._create_new_shard()
                DEBUG and Log.note(
                    "added new shard with name: {{shard}}", shard=self.shard.table_id
                )
            with Timer(
                "insert {{num}} rows to bq", param={"num": len(rows)}, verbose=DEBUG
            ):
                failures = self.container.client.insert_rows_json(
                    self.shard,
                    json_rows=typed_rows,
                    row_ids=[None] * len(typed_rows),
                    skip_invalid_rows=False,
                    ignore_unknown_values=False,
                )
            if failures:
                # "stopped" means the shard refused the batch; start a fresh shard
                # before surfacing the error (the caller may retry)
                if all(r == "stopped" for r in wrap(failures).errors.reason):
                    self._create_new_shard()
                    DEBUG and Log.note(
                        "STOPPED encountered: Added new shard with name: {{shard}}",
                        shard=self.shard.table_id,
                    )
                Log.error(
                    "Got {{num}} failures:\n{{failures|json}}",
                    num=len(failures),
                    failures=failures[:5],
                )
            else:
                self.last_extend = Date.now()
                DEBUG and Log.note("{{num}} rows added", num=len(typed_rows))
        except Exception as cause:
            cause = Except.wrap(cause)
            # NOTE(review): typed_rows may be unbound here if typed_encode
            # raised on the first row — confirm and guard if so
            if (
                len(typed_rows) < 2
                and "Your client has issued a malformed or illegal request." in cause
            ):
                Log.error(
                    "big query complains about:\n{{data|json}}",
                    data=typed_rows,
                    cause=cause,
                )
            elif len(rows) > 1 and (
                "Request payload size exceeds the limit" in cause
                or "An existing connection was forcibly closed by the remote host" in cause
                or "Your client has issued a malformed or illegal request." in cause
                or "BrokenPipeError(32, 'Broken pipe')" in cause
                or "ConnectionResetError(104, 'Connection reset by peer')" in cause
            ):
                # TRANSIENT/SIZE ERRORS: retry recursively in tenth-size chunks
                Log.warning(
                    "problem with batch of size {{size}}", size=len(rows), cause=cause
                )
                batch_size = ceiling(len(rows) / 10)
                try:
                    DEBUG and Log.note(
                        "attempt smaller batches of size {{batch_size}}",
                        batch_size=batch_size,
                    )
                    for _, chunk in jx.chunk(rows, batch_size):
                        self._extend(chunk)
                    return
                except Exception as cause2:
                    Log.error(
                        "smaller batches of size {{batch_size}} did not work",
                        batch_size=batch_size,
                        cause=cause2,
                    )
            elif len(rows) == 1:
                Log.error(
                    "Could not insert document\n{{doc|json|indent}}",
                    doc=rows[0],
                    cause=cause,
                )
            else:
                Log.error("Do not know how to handle", cause=cause)

    def add(self, row):
        """Insert a single document (delegates to extend)."""
        self.extend([row])

    def merge_shards(self):
        """
        Fold all shards into one primary shard (UNION ALL for identical
        schemas, per-shard column-mapped INSERT otherwise), delete merged
        shards and the old view, and re-point the view at the survivor.
        """
        shards = []
        tables = list(self.container.client.list_tables(self.container.dataset))
        current_view = Null  # VIEW THAT POINTS TO PRIMARY SHARD
        primary_shard_name = None  # PRIMARY SHARD
        api_name = escape_name(self.short_name)

        # CLASSIFY EVERY TABLE WHOSE NAME SHARES OUR PREFIX: the view itself,
        # or a suffix-named shard
        for table_item in tables:
            table = table_item.reference
            table_api_name = ApiName(table.table_id)
            if text(table_api_name).startswith(text(api_name)):
                if table_api_name == api_name:
                    if table_item.table_type != "VIEW":
                        Log.error("expecting {{table}} to be a view", table=api_name)
                    current_view = self.container.client.get_table(table)
                    view_sql = current_view.view_query
                    primary_shard_name = _extract_primary_shard_name(view_sql)
                elif SUFFIX_PATTERN.match(text(table_api_name)[len(text(api_name)) :]):
                    try:
                        known_table = self.container.client.get_table(table)
                        shards.append(known_table)
                    except Exception as e:
                        Log.warning(
                            "could not merge table {{table}}", table=table, cause=e
                        )

        if not current_view:
            Log.error(
                "expecting {{table}} to be a view pointing to a table", table=api_name
            )

        shard_flakes = [
            Snowflake.parse(
                big_query_schema=shard.schema,
                es_index=text(self.container.full_name + ApiName(shard.table_id)),
                top_level_fields=self.top_level_fields,
                partition=self.partition,
            )
            for shard in shards
        ]
        total_flake = snowflakes.merge(
            shard_flakes,
            es_index=text(self.full_name),
            top_level_fields=self.top_level_fields,
            partition=self.partition,
        )

        # KEEP CURRENT PRIMARY AS DESTINATION ONLY IF ITS SCHEMA ALREADY
        # MATCHES THE MERGED SCHEMA; OTHERWISE (for-else) MAKE A NEW PRIMARY
        for i, s in enumerate(shards):
            if ApiName(s.table_id) == primary_shard_name:
                if total_flake == shard_flakes[i]:
                    # USE THE CURRENT PRIMARY SHARD AS A DESTINATION
                    del shards[i]
                    del shard_flakes[i]
                    break
        else:
            name = self.short_name + "_" + "".join(randoms.sample(ALLOWED, 20))
            primary_shard_name = escape_name(name)
            self.container.create_table(
                table=name,
                schema=total_flake.schema,
                sharded=False,
                read_only=False,
                kwargs=self.config,
            )

        primary_full_name = self.container.full_name + primary_shard_name

        selects = []
        for flake, table in zip(shard_flakes, shards):
            # SELECT that maps each shard's columns onto the merged schema
            q = ConcatSQL(
                SQL_SELECT,
                JoinSQL(ConcatSQL(SQL_COMMA, SQL_CR), gen_select(total_flake, flake)),
                SQL_FROM,
                quote_column(ApiName(table.dataset_id, table.table_id)),
            )
            selects.append(q)

        DEBUG and Log.note(
            "inserting into table {{table}}", table=text(primary_shard_name)
        )
        matched = []
        unmatched = []
        for sel, shard, flake in zip(selects, shards, shard_flakes):
            if flake == total_flake:
                matched.append((sel, shard, flake))
            else:
                unmatched.append((sel, shard, flake))

        # EVERYTHING THAT IS IDENTICAL TO PRIMARY CAN BE MERGED WITH SIMPLE UNION ALL
        if matched:
            for g, merge_chunk in jx.chunk(matched, MAX_MERGE):
                command = ConcatSQL(
                    SQL_INSERT,
                    quote_column(primary_full_name),
                    JoinSQL(
                        SQL_UNION_ALL,
                        (
                            sql_query(
                                {
                                    "from": text(
                                        self.container.full_name + ApiName(shard.table_id)
                                    )
                                },
                                schema,
                            )
                            for _, shard, schema in merge_chunk
                        ),
                    ),
                )
                DEBUG and Log.note("{{sql}}", sql=text(command))
                job = self.container.query_and_wait(command)
                DEBUG and Log.note(
                    "job {{id}} state = {{state}}", id=job.job_id, state=job.state
                )
                if job.errors:
                    Log.error(
                        "\n{{sql}}\nDid not fill table:\n{{reason|json|indent}}",
                        sql=command.sql,
                        reason=job.errors,
                    )
                for _, shard, _ in merge_chunk:
                    self.container.client.delete_table(shard)

        # ALL OTHER SCHEMAS MISMATCH
        for s, shard, _ in unmatched:
            try:
                command = ConcatSQL(SQL_INSERT, quote_column(primary_full_name), s)
                DEBUG and Log.note("{{sql}}", sql=text(command))
                job = self.container.query_and_wait(command)
                DEBUG and Log.note(
                    "from {{shard}}, job {{id}}, state {{state}}",
                    id=job.job_id,
                    shard=shard.table_id,
                    state=job.state,
                )
                if job.errors:
                    if all(
                        " does not have a schema." in m
                        for m in wrap(job.errors).message
                    ):
                        pass  # NOTHING TO DO (empty shard)
                    else:
                        Log.error(
                            "\n{{sql}}\nDid not fill table:\n{{reason|json|indent}}",
                            sql=command.sql,
                            reason=job.errors,
                        )
                self.container.client.delete_table(shard)
            except Exception as e:
                Log.warning("failure to merge {{shard}}", shard=shard, cause=e)

        # REMOVE OLD VIEW
        view_full_name = self.container.full_name + api_name
        if current_view:
            self.container.client.delete_table(current_view)
        # CREATE NEW VIEW
        self.container.create_view(view_full_name, primary_full_name)

    def condense(self):
        """
        Deduplicate: keep, for each identity (self.id.field), only the row
        with the highest version (self.id.version), via ROW_NUMBER().
        """
        # MAKE NEW SHARD
        partition = JoinSQL(
            SQL_COMMA,
            [
                quote_column(c.es_field)
                for f in listwrap(self.id.field)
                for c in self.flake.leaves(f)
            ],
        )
        order_by = JoinSQL(
            SQL_COMMA,
            [
                ConcatSQL(quote_column(c.es_field), SQL_DESC)
                for f in listwrap(self.id.version)
                for c in self.flake.leaves(f)
            ],
        )
        # WRAP WITH etl.timestamp BEST SELECTION
        self.container.query_and_wait(
            ConcatSQL(
                SQL(  # SOME KEYWORDS: ROWNUM RANK
                    "SELECT * EXCEPT (_rank) FROM (SELECT *, ROW_NUMBER() OVER (PARTITION BY "
                ),
                partition,
                SQL_ORDERBY,
                order_by,
                SQL(") AS _rank FROM "),
                quote_column(self.full_name),
                SQL(") a WHERE _rank=1"),
            )
        )
class ElasticsearchMetadata(Namespace):
    """
    MANAGE SNOWFLAKE SCHEMAS FOR EACH OF THE ALIASES FOUND IN THE CLUSTER

    Instances are memoized per ES cluster (see __new__); a background thread
    refreshes column cardinality/partition statistics.
    """

    @override
    def __new__(cls, kwargs, *args, **_kwargs):
        # ONE INSTANCE PER CLUSTER (keyed on the Cluster object's id; the
        # Cluster constructor is assumed to memoize per host — TODO confirm)
        es_cluster = elasticsearch.Cluster(kwargs)
        output = known_clusters.get(id(es_cluster))
        if output is None:
            output = object.__new__(cls)
            known_clusters[id(es_cluster)] = output
        return output

    @override
    def __init__(self, host, index, sql_file='metadata.sqlite', alias=None, name=None, port=9200, kwargs=None):
        # GUARD: __new__ may return an already-initialized singleton
        if hasattr(self, "settings"):
            return

        self.too_old = TOO_OLD
        self.settings = kwargs
        self.default_name = coalesce(name, alias, index)
        self.es_cluster = elasticsearch.Cluster(kwargs=kwargs)

        self.index_does_not_exist = set()
        # COLUMNS WAITING FOR A CARDINALITY REFRESH (consumed by monitor thread)
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        self.index_to_alias = Relation_usingList()

        self.es_metadata = Null
        self.metadata_last_updated = Date.now() - OLD_METADATA

        self.meta = Data()
        self.meta.columns = ColumnList()

        self.alias_to_query_paths = {
            "meta.columns": [['.']],
            "meta.tables": [['.']]
        }
        self.alias_last_updated = {
            "meta.columns": Date.now(),
            "meta.tables": Date.now()
        }
        table_columns = metadata_tables()
        self.meta.tables = ListContainer(
            "meta.tables",
            [
                # TableDesc("meta.columns", None, ".", Date.now()),
                # TableDesc("meta.tables", None, ".", Date.now())
            ],
            jx_base.Schema(".", table_columns)
        )
        self.meta.columns.extend(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("refresh metadata", self.not_monitor)
        return

    @property
    def namespace(self):
        return self.meta.columns.namespace

    @property
    def url(self):
        return self.es_cluster.url / self.default_name.replace(".", "/")

    def _reload_columns(self, table_desc):
        """
        :param alias: A REAL ALIAS (OR NAME OF INDEX THAT HAS NO ALIAS)
        :return:
        """
        # FIND ALL INDEXES OF ALIAS
        es_last_updated = self.es_cluster.metatdata_last_updated

        alias = table_desc.name
        canonical_index = self.es_cluster.get_best_matching_index(alias).index
        # NOTE(review): the `not` makes force=True only when our timestamp is
        # NOT older than the cluster's — looks inverted; confirm intent
        update_required = not (table_desc.timestamp < es_last_updated)
        metadata = self.es_cluster.get_metadata(force=update_required)

        indexes = self.index_to_alias.get_domain(alias)
        props = [
            (self.es_cluster.get_index(index=i, type=t, debug=DEBUG), t, m.properties)
            for i, d in metadata.indices.items()
            if i in indexes
            for t, m in [_get_best_type_from_mapping(d.mappings)]
        ]

        # CONFIRM ALL COLUMNS ARE SAME, FIX IF NOT
        dirty = False
        all_comparisions = list(jx.pairwise(props)) + list(jx.pairwise(jx.reverse(props)))
        # NOTICE THE SAME (index, type, properties) TRIPLE FROM ABOVE
        for (i1, t1, p1), (i2, t2, p2) in all_comparisions:
            diff = elasticsearch.diff_schema(p2, p1)
            if not self.settings.read_only:
                for d in diff:
                    dirty = True
                    i1.add_property(*d)
        meta = self.es_cluster.get_metadata(force=dirty).indices[canonical_index]

        data_type, mapping = _get_best_type_from_mapping(meta.mappings)
        # _id IS NOT IN THE MAPPING; ADD IT EXPLICITLY
        mapping.properties["_id"] = {"type": "string", "index": "not_analyzed"}

        self._parse_properties(alias, mapping, meta)
        table_desc.timestamp = es_last_updated

    def _parse_properties(self, alias, mapping, meta):
        """
        Convert an ES mapping into Column records: compute nested query
        paths, register relative names, and queue each column for a
        cardinality refresh.
        """
        abs_columns = elasticsearch.parse_properties(alias, None, mapping.properties)
        if any(c.cardinality == 0 and c.names['.'] != '_id' for c in abs_columns):
            Log.warning(
                "Some columns are not stored {{names}}",
                names=[
                    ".".join((c.es_index, c.names['.']))
                    for c in abs_columns
                    if c.cardinality == 0
                ]
            )

        with Timer("upserting {{num}} columns", {"num": len(abs_columns)}, silent=not DEBUG):
            # LIST OF EVERY NESTED PATH
            query_paths = [[c.es_column] for c in abs_columns if c.es_type == "nested"]
            # INSERT EACH NESTED PATH AS A PREFIX STEP OF EVERY DEEPER PATH
            for a, b in itertools.product(query_paths, query_paths):
                aa = a[0]
                bb = b[0]
                if aa and bb.startswith(aa):
                    for i, b_prefix in enumerate(b):
                        if len(b_prefix) > len(aa):
                            continue
                        if aa == b_prefix:
                            break  # SPLIT ALREADY FOUND
                        b.insert(i, aa)
                        break
            for q in query_paths:
                q.append(SELF_PATH)
            query_paths.append(ROOT_PATH)
            self.alias_to_query_paths[alias] = query_paths
            for i in self.index_to_alias.get_domain(alias):
                self.alias_to_query_paths[i] = query_paths

            # ADD RELATIVE NAMES
            for abs_column in abs_columns:
                abs_column.last_updated = None
                abs_column.jx_type = jx_type(abs_column)
                for query_path in query_paths:
                    abs_column.names[query_path[0]] = relative_field(abs_column.names["."], query_path[0])
                # columns.add() returns the canonical column; queue it for refresh
                self.todo.add(self.meta.columns.add(abs_column))
        pass

    def query(self, _query):
        return self.meta.columns.query(QueryOp(set_default(
            {
                "from": self.meta.columns,
                "sort": ["table", "name"]
            },
            _query.__data__()
        )))

    def _find_alias(self, name):
        # REFRESH THE index->alias MAP IF CLUSTER METADATA IS NEWER
        if self.metadata_last_updated < self.es_cluster.metatdata_last_updated:
            for a in self.es_cluster.get_aliases():
                self.index_to_alias[a.index] = coalesce(a.alias, a.index)
                self.alias_last_updated.setdefault(a.alias, Date.MIN)
        if name in self.alias_last_updated:
            return name
        else:
            return self.index_to_alias[name]

    def get_columns(self, table_name, column_name=None, force=False):
        """
        RETURN METADATA COLUMNS

        :param table_name: alias (or index) name; may be a dotted path
        :param column_name: optionally restrict to one column
        :param force: reload columns even if the cached copy looks fresh
        """
        table_path = split_field(table_name)
        root_table_name = table_path[0]

        alias = self._find_alias(root_table_name)
        if not alias:
            self.es_cluster.get_metadata(force=True)
            alias = self._find_alias(root_table_name)
            if not alias:
                Log.error("{{table|quote}} does not exist", table=table_name)

        try:
            last_update = MAX([
                self.es_cluster.index_last_updated[i]
                for i in self.index_to_alias.get_domain(alias)
            ])

            table = self.get_table(alias)[0]
            # LAST TIME WE GOT INFO FOR THIS TABLE
            if not table:
                table = TableDesc(
                    name=alias,
                    url=None,
                    query_path=['.'],
                    timestamp=Date.MIN
                )
                with self.meta.tables.locker:
                    self.meta.tables.add(table)
                self._reload_columns(table)
            elif force or table.timestamp < last_update:
                self._reload_columns(table)

            columns = self.meta.columns.find(alias, column_name)
            columns = jx.sort(columns, "names.\\.")
            # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
            while len(self.todo) and not all(columns.get("last_updated")):
                if DEBUG:
                    if len(columns) > 10:
                        Log.note("waiting for {{num}} columns to update", num=len([c for c in columns if not c.last_updated]))
                    else:
                        Log.note("waiting for columns to update {{columns|json}}", columns=[c.es_index+"."+c.es_column for c in columns if not c.last_updated])
                Till(seconds=1).wait()
            return columns
        except Exception as e:
            Log.error("Not expected", cause=e)
        return []

    def _update_cardinality(self, column):
        """
        QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN
        """
        if column.es_index in self.index_does_not_exist:
            return

        if column.jx_type in STRUCT:
            Log.error("not supported")
        try:
            # META TABLES ARE COMPUTED LOCALLY, NO ES QUERY NEEDED
            if column.es_index == "meta.columns":
                partitions = jx.sort([g[column.es_column] for g, _ in jx.groupby(self.meta.columns, column.es_column) if g[column.es_column] != None])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.columns),
                        "cardinality": len(partitions),
                        "multi": 1,
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            if column.es_index == "meta.tables":
                partitions = jx.sort([g[column.es_column] for g, _ in jx.groupby(self.meta.tables, column.es_column) if g[column.es_column] != None])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.tables),
                        "cardinality": len(partitions),
                        "multi": 1,
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return

            es_index = column.es_index.split(".")[0]

            is_text = [cc for cc in self.meta.columns if cc.es_column == column.es_column and cc.es_type == "text"]
            if is_text:
                # text IS A MULTIVALUE STRING THAT CAN ONLY BE FILTERED
                # (cardinality forced over the 1000 threshold so no partitions are kept)
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "aggs": {
                        "count": {"filter": {"match_all": {}}}
                    },
                    "size": 0
                })
                count = result.hits.total
                cardinality = max(1001, count)
                multi = 1001
            elif column.es_column == "_id":
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "query": {"match_all": {}},
                    "size": 0
                })
                count = cardinality = result.hits.total
                multi = 1
            elif column.es_type == BOOLEAN:
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "aggs": {
                        "count": _counting_query(column)
                    },
                    "size": 0
                })
                count = result.hits.total
                cardinality = 2
                multi = 1
            else:
                # GENERAL CASE: cardinality agg plus max values-per-doc ("multi")
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "aggs": {
                        "count": _counting_query(column),
                        "multi": {"max": {"script": "doc[" + quote(column.es_column) + "].values.size()"}}
                    },
                    "size": 0
                })
                agg_results = result.aggregations
                count = result.hits.total
                cardinality = coalesce(agg_results.count.value, agg_results.count._nested.value, agg_results.count.doc_count)
                multi = int(coalesce(agg_results.multi.value, 1))
                if cardinality == None:
                    Log.error("logic error")

            query = Data(size=0)

            if column.es_column == "_id":
                self.meta.columns.update({
                    "set": {
                        "count": cardinality,
                        "cardinality": cardinality,
                        "multi": 1,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            elif cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99):
                # TOO MANY DISTINCT VALUES TO BE USEFUL AS PARTITIONS
                DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            elif column.es_type in elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
                DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            elif len(column.nested_path) != 1:
                # NESTED COLUMN: terms agg must be wrapped in a nested agg
                query.aggs["_"] = {
                    "nested": {"path": column.nested_path[0]},
                    "aggs": {"_nested": {"terms": {"field": column.es_column}}}
                }
            elif cardinality == 0:
                query.aggs["_"] = {"terms": {"field": column.es_column}}
            else:
                query.aggs["_"] = {"terms": {"field": column.es_column, "size": cardinality}}

            result = self.es_cluster.post("/" + es_index + "/_search", data=query)

            aggs = result.aggregations._
            if aggs._nested:
                parts = jx.sort(aggs._nested.buckets.key)
            else:
                parts = jx.sort(aggs.buckets.key)

            self.meta.columns.update({
                "set": {
                    "count": count,
                    "cardinality": cardinality,
                    "multi": multi,
                    "partitions": parts,
                    "last_updated": Date.now()
                },
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
        except Exception as e:
            # CAN NOT IMPORT: THE TEST MODULES SETS UP LOGGING
            # from tests.test_jx import TEST_TABLE
            e = Except.wrap(e)
            TEST_TABLE = "testdata"
            is_missing_index = any(w in e for w in ["IndexMissingException", "index_not_found_exception"])
            is_test_table = column.es_index.startswith((TEST_TABLE_PREFIX, TEST_TABLE))
            if is_missing_index and is_test_table:
                # WE EXPECT TEST TABLES TO DISAPPEAR
                self.meta.columns.update({
                    "clear": ".",
                    "where": {"eq": {"es_index": column.es_index}}
                })
                self.index_does_not_exist.add(column.es_index)
            else:
                self.meta.columns.update({
                    "set": {
                        "last_updated": Date.now()
                    },
                    "clear": [
                        "count",
                        "cardinality",
                        "multi",
                        "partitions",
                    ],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                Log.warning("Could not get {{col.es_index}}.{{col.es_column}} info", col=column, cause=e)

    def monitor(self, please_stop):
        """
        Background loop: drain self.todo, refreshing each column's
        cardinality; when idle, re-queue columns older than TOO_OLD.
        """
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            try:
                if not self.todo:
                    old_columns = [
                        c
                        for c in self.meta.columns
                        if (c.last_updated == None or c.last_updated < Date.now()-TOO_OLD) and c.jx_type not in STRUCT
                    ]
                    if old_columns:
                        DEBUG and Log.note(
                            "Old columns {{names|json}} last updated {{dates|json}}",
                            names=wrap(old_columns).es_column,
                            dates=[Date(t).format() for t in wrap(old_columns).last_updated]
                        )
                        self.todo.extend(old_columns)
                        # TEST CONSISTENCY (no duplicate column objects in queue)
                        for c, d in product(list(self.todo.queue), list(self.todo.queue)):
                            if c.es_column == d.es_column and c.es_index == d.es_index and c != d:
                                # NOTE(review): empty error message — consider describing the duplicate
                                Log.error("")
                    else:
                        DEBUG and Log.note("no more metatdata to update")

                column = self.todo.pop(Till(seconds=(10*MINUTE).seconds))
                if column:
                    if column is THREAD_STOP:
                        continue
                    with Timer("update {{table}}.{{column}}", param={"table": column.es_index, "column": column.es_column}, silent=not DEBUG):
                        if column.es_index in self.index_does_not_exist:
                            self.meta.columns.update({
                                "clear": ".",
                                "where": {"eq": {"es_index": column.es_index}}
                            })
                            continue
                        if column.jx_type in STRUCT or column.es_column.endswith("." + EXISTS_TYPE):
                            column.last_updated = Date.now()
                            continue
                        elif column.last_updated >= Date.now()-TOO_OLD:
                            continue
                        try:
                            self._update_cardinality(column)
                            (DEBUG and not column.es_index.startswith(TEST_TABLE_PREFIX)) and Log.note("updated {{column.name}}", column=column)
                        except Exception as e:
                            if '"status":404' in e:
                                # INDEX IS GONE: drop its column records
                                self.meta.columns.update({
                                    "clear": ".",
                                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                                })
                            else:
                                Log.warning("problem getting cardinality for {{column.name}}", column=column, cause=e)
            except Exception as e:
                Log.warning("problem in cardinality monitor", cause=e)

    def not_monitor(self, please_stop):
        """
        Stand-in for monitor() when scanning is disabled: mark queued
        columns as updated (clearing stats) without querying ES.
        """
        Log.alert("metadata scan has been disabled")
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            c = self.todo.pop()
            if c == THREAD_STOP:
                break
            if c.last_updated >= Date.now()-TOO_OLD:
                continue

            with Timer("Update {{col.es_index}}.{{col.es_column}}", param={"col": c}, silent=not DEBUG, too_long=0.05):
                self.meta.columns.update({
                    "set": {
                        "last_updated": Date.now()
                    },
                    "clear": [
                        "count",
                        "cardinality",
                        "multi",
                        "partitions",
                    ],
                    "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                })

    def get_table(self, name):
        if name == "meta.columns":
            return self.meta.columns
        # return self.meta.columns
        with self.meta.tables.locker:
            return wrap([t for t in self.meta.tables.data if t.name == name])

    def get_snowflake(self, fact_table_name):
        return Snowflake(fact_table_name, self)

    def get_schema(self, name):
        if name == "meta.columns":
            return self.meta.columns.schema
        query_path = split_field(name)
        root, rest = query_path[0], join_field(query_path[1:])
        return self.get_snowflake(root).get_schema(rest)
class ElasticsearchMetadata(Namespace): """ MANAGE SNOWFLAKE SCHEMAS FOR EACH OF THE ALIASES FOUND IN THE CLUSTER """ @override def __new__(cls, kwargs, *args, **_kwargs): es_cluster = elasticsearch.Cluster(kwargs) output = known_clusters.get(id(es_cluster)) if output is None: output = object.__new__(cls) known_clusters[id(es_cluster)] = output return output @override def __init__(self, host, index, sql_file='metadata.sqlite', alias=None, name=None, port=9200, kwargs=None): if hasattr(self, "settings"): return self.too_old = TOO_OLD self.settings = kwargs self.default_name = coalesce(name, alias, index) self.es_cluster = elasticsearch.Cluster(kwargs=kwargs) self.index_does_not_exist = set() self.todo = Queue("refresh metadata", max=100000, unique=True) self.index_to_alias = {} self.es_metadata = Null self.metadata_last_updated = Date.now() - OLD_METADATA self.meta = Data() self.meta.columns = ColumnList(URL(self.es_cluster.settings.host).host) self.alias_to_query_paths = { "meta.columns": [ROOT_PATH], "meta.tables": [ROOT_PATH] } self.alias_last_updated = { "meta.columns": Date.now(), "meta.tables": Date.now() } table_columns = metadata_tables() self.meta.tables = ListContainer("meta.tables", [], jx_base.Schema(".", table_columns)) self.meta.columns.extend(table_columns) # TODO: fix monitor so it does not bring down ES if ENABLE_META_SCAN: self.worker = Thread.run("refresh metadata", self.monitor) else: self.worker = Thread.run("not refresh metadata", self.not_monitor) return @property def namespace(self): return self.meta.columns.namespace @property def url(self): return self.es_cluster.url / self.default_name.replace(".", "/") def _reload_columns(self, table_desc): """ :param alias: A REAL ALIAS (OR NAME OF INDEX THAT HAS NO ALIAS) :return: """ # FIND ALL INDEXES OF ALIAS es_last_updated = self.es_cluster.metatdata_last_updated alias = table_desc.name canonical_index = self.es_cluster.get_best_matching_index(alias).index es_metadata_update_required = not 
(table_desc.timestamp < es_last_updated) metadata = self.es_cluster.get_metadata( force=es_metadata_update_required) props = [(self.es_cluster.get_index(index=i, type=t, debug=DEBUG), t, m.properties) for i, d in metadata.indices.items() if alias in d.aliases for t, m in [_get_best_type_from_mapping(d.mappings)]] # CONFIRM ALL COLUMNS ARE SAME, FIX IF NOT dirty = False all_comparisions = list(jx.pairwise(props)) + list( jx.pairwise(jx.reverse(props))) # NOTICE THE SAME (index, type, properties) TRIPLE FROM ABOVE for (i1, t1, p1), (i2, t2, p2) in all_comparisions: diff = elasticsearch.diff_schema(p2, p1) if not self.settings.read_only: for d in diff: dirty = True i1.add_property(*d) meta = self.es_cluster.get_metadata( force=dirty).indices[canonical_index] data_type, mapping = _get_best_type_from_mapping(meta.mappings) mapping.properties["_id"] = {"type": "string", "index": "not_analyzed"} columns = self._parse_properties(alias, mapping) table_desc.timestamp = es_last_updated return columns def _parse_properties(self, alias, mapping): abs_columns = elasticsearch.parse_properties(alias, ".", ROOT_PATH, mapping.properties) if DEBUG and any(c.cardinality == 0 and c.name != '_id' for c in abs_columns): Log.warning( "Some columns are not stored in {{url}} {{index|quote}} table:\n{{names}}", url=self.es_cluster.url, index=alias, names=[ ".".join((c.es_index, c.name)) for c in abs_columns if c.cardinality == 0 ]) with Timer("upserting {{num}} columns", {"num": len(abs_columns)}, silent=not DEBUG): # LIST OF EVERY NESTED PATH query_paths = [[c.es_column] for c in abs_columns if c.es_type == "nested"] for a, b in itertools.product(query_paths, query_paths): aa = a[0] bb = b[0] if aa and bb.startswith(aa): for i, b_prefix in enumerate(b): if len(b_prefix) > len(aa): continue if aa == b_prefix: break # SPLIT ALREADY FOUND b.insert(i, aa) break for q in query_paths: q.append(".") query_paths.append(ROOT_PATH) # ENSURE ALL TABLES HAVE THE QUERY PATHS SET 
self.alias_to_query_paths[alias] = query_paths for i, a in self.index_to_alias.items(): if a == alias: self.alias_to_query_paths[i] = query_paths # ENSURE COLUMN HAS CORRECT jx_type # PICK DEEPEST NESTED PROPERTY AS REPRESENTATIVE output = [] best = {} for abs_column in abs_columns: abs_column.jx_type = jx_type(abs_column) if abs_column.jx_type not in STRUCT: clean_name = unnest_path(abs_column.name) other = best.get(clean_name) if other: if len(other.nested_path) < len( abs_column.nested_path): output.remove(other) self.meta.columns.update({ "clear": ".", "where": { "eq": { "es_column": other.es_column, "es_index": other.es_index } } }) else: continue best[clean_name] = abs_column output.append(abs_column) # REGISTER ALL COLUMNS canonicals = [] for abs_column in output: canonical = self.meta.columns.add(abs_column) canonicals.append(canonical) self.todo.extend(canonicals) return canonicals def query(self, _query): return self.meta.columns.query( QueryOp( set_default( { "from": self.meta.columns, "sort": ["table", "name"] }, _query.__data__()))) def _find_alias(self, name): if self.metadata_last_updated < self.es_cluster.metatdata_last_updated: for a in self.es_cluster.get_aliases(): self.index_to_alias[a.index] = coalesce(a.alias, a.index) self.alias_last_updated.setdefault(a.alias, Date.MIN) if name in self.alias_last_updated: return name else: return self.index_to_alias.get(name) def get_columns(self, table_name, column_name=None, after=None, timeout=None): """ RETURN METADATA COLUMNS :param table_name: TABLE WE WANT COLUMNS FOR :param column_name: OPTIONAL NAME, IF INTERESTED IN ONLY ONE COLUMN :param after: FORCE LOAD, WAITING FOR last_updated TO BE AFTER THIS TIME :param timeout: Signal; True when should give up :return: """ DEBUG and after and Log.note("getting columns for after {{time}}", time=after) table_path = split_field(table_name) root_table_name = table_path[0] alias = self._find_alias(root_table_name) if not alias: 
self.es_cluster.get_metadata(force=True) alias = self._find_alias(root_table_name) if not alias: Log.error("{{table|quote}} does not exist", table=table_name) try: table = self.get_table(alias)[0] # LAST TIME WE GOT INFO FOR THIS TABLE if not table: table = TableDesc(name=alias, url=None, query_path=["."], timestamp=Date.MIN) with self.meta.tables.locker: self.meta.tables.add(table) columns = self._reload_columns(table) DEBUG and Log.note("columns from reload") elif after or table.timestamp < self.es_cluster.metatdata_last_updated: columns = self._reload_columns(table) DEBUG and Log.note("columns from reload") else: columns = self.meta.columns.find(alias, column_name) DEBUG and Log.note("columns from find()") DEBUG and Log.note("columns are {{ids}}", ids=[id(c) for c in columns]) columns = jx.sort(columns, "name") if after is None: return columns # DO NOT WAIT FOR COMPLETE COLUMNS # WAIT FOR THE COLUMNS TO UPDATE while True: pending = [ c for c in columns if after >= c.last_updated or ( c.cardinality == None and c.jx_type not in STRUCT) ] if not pending: break if timeout: Log.error("trying to gets columns timed out") if DEBUG: if len(pending) > 10: Log.note( "waiting for {{num}} columns to update by {{timestamp}}", num=len(pending), timestamp=after) else: Log.note( "waiting for columns to update by {{timestamp}}; {{columns|json}}", timestamp=after, columns=[ c.es_index + "." 
+ c.es_column + " id=" + text_type(id(c)) for c in pending ]) Till(seconds=1).wait() return columns except Exception as e: Log.error("Failure to get columns for {{table}}", table=table_name, cause=e) return [] def _update_cardinality(self, column): """ QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN """ now = Date.now() if column.es_index in self.index_does_not_exist: return if column.jx_type in STRUCT: Log.error("not supported") try: if column.es_index == "meta.columns": partitions = jx.sort([ g[column.es_column] for g, _ in jx.groupby(self.meta.columns, column.es_column) if g[column.es_column] != None ]) self.meta.columns.update({ "set": { "partitions": partitions, "count": len(self.meta.columns), "cardinality": len(partitions), "multi": 1, "last_updated": now }, "where": { "eq": { "es_index": column.es_index, "es_column": column.es_column } } }) return if column.es_index == "meta.tables": partitions = jx.sort([ g[column.es_column] for g, _ in jx.groupby(self.meta.tables, column.es_column) if g[column.es_column] != None ]) self.meta.columns.update({ "set": { "partitions": partitions, "count": len(self.meta.tables), "cardinality": len(partitions), "multi": 1, "last_updated": now }, "where": { "eq": { "es_index": column.es_index, "es_column": column.es_column } } }) return es_index = column.es_index.split(".")[0] is_text = [ cc for cc in self.meta.columns if cc.es_column == column.es_column and cc.es_type == "text" ] if is_text: # text IS A MULTIVALUE STRING THAT CAN ONLY BE FILTERED result = self.es_cluster.post("/" + es_index + "/_search", data={ "aggs": { "count": { "filter": { "match_all": {} } } }, "size": 0 }) count = result.hits.total cardinality = max(1001, count) multi = 1001 elif column.es_column == "_id": result = self.es_cluster.post("/" + es_index + "/_search", data={ "query": { "match_all": {} }, "size": 0 }) count = cardinality = result.hits.total multi = 1 elif column.es_type == BOOLEAN: result = self.es_cluster.post("/" + es_index + 
"/_search", data={ "aggs": { "count": _counting_query(column) }, "size": 0 }) count = result.hits.total cardinality = 2 DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality) self.meta.columns.update({ "set": { "count": count, "cardinality": cardinality, "partitions": [False, True], "multi": 1, "last_updated": now }, "clear": ["partitions"], "where": { "eq": { "es_index": column.es_index, "es_column": column.es_column } } }) return else: es_query = { "aggs": { "count": _counting_query(column), "_filter": { "aggs": { "multi": { "max": { "script": "doc[" + quote(column.es_column) + "].values.size()" } } }, "filter": { "bool": { "should": [{ "range": { "etl.timestamp.~n~": { "gte": (Date.today() - WEEK) } } }, { "bool": { "must_not": { "exists": { "field": "etl.timestamp.~n~" } } } }] } } } }, "size": 0 } result = self.es_cluster.post("/" + es_index + "/_search", data=es_query) agg_results = result.aggregations count = result.hits.total cardinality = coalesce(agg_results.count.value, agg_results.count._nested.value, agg_results.count.doc_count) multi = int(coalesce(agg_results._filter.multi.value, 1)) if cardinality == None: Log.error("logic error") query = Data(size=0) if column.es_column == "_id": self.meta.columns.update({ "set": { "count": cardinality, "cardinality": cardinality, "multi": 1, "last_updated": now }, "clear": ["partitions"], "where": { "eq": { "es_index": column.es_index, "es_column": column.es_column } } }) return elif cardinality > 1000 or (count >= 30 and cardinality == count ) or (count >= 1000 and cardinality / count > 0.99): DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality) self.meta.columns.update({ "set": { "count": count, "cardinality": cardinality, "multi": multi, "last_updated": now }, "clear": ["partitions"], "where": { "eq": { "es_index": column.es_index, "es_column": column.es_column } } }) 
return elif column.es_type in elasticsearch.ES_NUMERIC_TYPES and cardinality > 30: DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality) self.meta.columns.update({ "set": { "count": count, "cardinality": cardinality, "multi": multi, "last_updated": now }, "clear": ["partitions"], "where": { "eq": { "es_index": column.es_index, "es_column": column.es_column } } }) return elif len(column.nested_path) != 1: query.aggs["_"] = { "nested": { "path": column.nested_path[0] }, "aggs": { "_nested": { "terms": { "field": column.es_column } } } } elif cardinality == 0: # WHEN DOES THIS HAPPEN? query.aggs["_"] = {"terms": {"field": column.es_column}} else: query.aggs["_"] = { "terms": { "field": column.es_column, "size": cardinality } } result = self.es_cluster.post("/" + es_index + "/_search", data=query) aggs = result.aggregations._ if aggs._nested: parts = jx.sort(aggs._nested.buckets.key) else: parts = jx.sort(aggs.buckets.key) DEBUG and Log.note( "update metadata for {{column.es_index}}.{{column.es_column}} (id={{id}}) at {{time}}", id=id(column), column=column, time=now) self.meta.columns.update({ "set": { "count": count, "cardinality": cardinality, "multi": multi, "partitions": parts, "last_updated": now }, "where": { "eq": { "es_index": column.es_index, "es_column": column.es_column } } }) except Exception as e: # CAN NOT IMPORT: THE TEST MODULES SETS UP LOGGING # from tests.test_jx import TEST_TABLE e = Except.wrap(e) TEST_TABLE = "testdata" is_missing_index = any( w in e for w in ["IndexMissingException", "index_not_found_exception"]) is_test_table = column.es_index.startswith( (TEST_TABLE_PREFIX, TEST_TABLE)) if is_missing_index: # WE EXPECT TEST TABLES TO DISAPPEAR Log.warning("Missing index {{col.es_index}}", col=column, cause=e) self.meta.columns.update({ "clear": ".", "where": { "eq": { "es_index": column.es_index } } }) self.index_does_not_exist.add(column.es_index) elif "No field found for" 
in e: self.meta.columns.update({ "clear": ".", "where": { "eq": { "es_index": column.es_index, "es_column": column.es_column } } }) Log.warning( "Could not get column {{col.es_index}}.{{col.es_column}} info", col=column, cause=e) else: self.meta.columns.update({ "set": { "last_updated": now }, "clear": [ "count", "cardinality", "multi", "partitions", ], "where": { "eq": { "es_index": column.es_index, "es_column": column.es_column } } }) Log.warning( "Could not get {{col.es_index}}.{{col.es_column}} info", col=column, cause=e) def monitor(self, please_stop): please_stop.on_go(lambda: self.todo.add(THREAD_STOP)) while not please_stop: try: if not self.todo: old_columns = [ c for c in self.meta.columns if ((c.last_updated < Date.now() - MAX_COLUMN_METADATA_AGE) or c.cardinality == None) and c.jx_type not in STRUCT ] if old_columns: DEBUG and Log.note( "Old columns {{names|json}} last updated {{dates|json}}", names=wrap(old_columns).es_column, dates=[ Date(t).format() for t in wrap(old_columns).last_updated ]) self.todo.extend(old_columns) else: DEBUG and Log.note("no more metatdata to update") column = self.todo.pop(Till(seconds=(10 * MINUTE).seconds)) if column: if column is THREAD_STOP: continue with Timer("update {{table}}.{{column}}", param={ "table": column.es_index, "column": column.es_column }, silent=not DEBUG): if column.es_index in self.index_does_not_exist: DEBUG and Log.note( "{{column.es_column}} does not exist", column=column) self.meta.columns.update({ "clear": ".", "where": { "eq": { "es_index": column.es_index } } }) continue if column.jx_type in STRUCT or split_field( column.es_column)[-1] == EXISTS_TYPE: DEBUG and Log.note( "{{column.es_column}} is a struct", column=column) column.last_updated = Date.now() continue elif column.last_updated > Date.now( ) - TOO_OLD and column.cardinality is not None: # DO NOT UPDATE FRESH COLUMN METADATA DEBUG and Log.note( "{{column.es_column}} is still fresh ({{ago}} ago)", column=column, ago=(Date.now() - 
Date(column.last_updated)).seconds) continue try: self._update_cardinality(column) (DEBUG and not column.es_index.startswith(TEST_TABLE_PREFIX) ) and Log.note("updated {{column.name}}", column=column) except Exception as e: if '"status":404' in e: self.meta.columns.update({ "clear": ".", "where": { "eq": { "es_index": column.es_index, "es_column": column.es_column } } }) else: Log.warning( "problem getting cardinality for {{column.name}}", column=column, cause=e) except Exception as e: Log.warning("problem in cardinality monitor", cause=e) def not_monitor(self, please_stop): Log.alert("metadata scan has been disabled") please_stop.on_go(lambda: self.todo.add(THREAD_STOP)) while not please_stop: column = self.todo.pop() if column == THREAD_STOP: break if column.jx_type in STRUCT or split_field( column.es_column)[-1] == EXISTS_TYPE: DEBUG and Log.note("{{column.es_column}} is a struct", column=column) column.last_updated = Date.now() continue elif column.last_updated > Date.now( ) - TOO_OLD and column.cardinality is not None: # DO NOT UPDATE FRESH COLUMN METADATA DEBUG and Log.note( "{{column.es_column}} is still fresh ({{ago}} ago)", column=column, ago=(Date.now() - Date(column.last_updated)).seconds) continue with Timer("Update {{col.es_index}}.{{col.es_column}}", param={"col": column}, silent=not DEBUG, too_long=0.05): if untype_path(column.name) in ["build.type", "run.type"]: try: self._update_cardinality(column) except Exception as e: Log.warning( "problem getting cardinality for {{column.name}}", column=column, cause=e) else: column.last_updated = Date.now() def get_table(self, name): if name == "meta.columns": return self.meta.columns with self.meta.tables.locker: return wrap([t for t in self.meta.tables.data if t.name == name]) def get_snowflake(self, fact_table_name): return Snowflake(fact_table_name, self) def get_schema(self, name): if name == "meta.columns": return self.meta.columns.schema if name == "meta.tables": return self.meta.tables root, rest = 
tail_field(name) return self.get_snowflake(root).get_schema(rest)
class Extract(object):
    """
    Pull rows from a MySQL "snowflake" fact table in batches, write each batch
    to S3 as JSON lines, and notify an SQS queue for downstream ETL.

    Progress is persisted to `settings.extract.last` as `[start_point, first_value]`
    so a restart resumes where the previous run stopped.
    """

    @override
    def __init__(self, kwargs=None):
        self.settings = kwargs
        self.schema = SnowflakeSchema(self.settings.snowflake)
        self._extract = extract = kwargs.extract

        # SOME PREP
        get_git_revision()

        # VERIFY WE DO NOT HAVE TOO MANY OTHER PROCESSES WORKING ON STUFF
        with MySQL(**kwargs.snowflake.database) as db:
            processes = None
            try:
                processes = jx.filter(
                    db.query("show processlist"),
                    {"and": [
                        {"neq": {"Command": "Sleep"}},
                        {"neq": {"Info": "show processlist"}}
                    ]}
                )
            except Exception as e:
                Log.warning("no database", cause=e)

        if processes:
            if DEBUG:
                Log.warning("Processes are running\n{{list|json}}", list=processes)
            else:
                Log.error("Processes are running\n{{list|json}}", list=processes)

        # type/start/batch/field ARE PARALLEL ARRAYS, ONE ENTRY PER ETL DIMENSION
        extract.type = listwrap(extract.type)
        extract.start = listwrap(extract.start)
        extract.batch = listwrap(extract.batch)
        extract.field = listwrap(extract.field)
        if any(
            len(extract.type) != len(other)
            for other in [extract.start, extract.batch, extract.field]
        ):
            Log.error(
                "Expecting same number of dimensions for `type`, `start`, `batch`, and `field` in the `extract` inner object"
            )
        for i, t in enumerate(extract.type):
            if t == "time":
                extract.start[i] = Date(extract.start[i])
                extract.batch[i] = Duration(extract.batch[i])
            elif t == "number":
                pass
            else:
                Log.error('Expecting `extract.type` to be "number" or "time"')

        extract.threads = coalesce(extract.threads, 1)
        self.done_pulling = Signal()
        # SMALL max KEEPS THE PULLER FROM RUNNING TOO FAR AHEAD OF THE WORKERS
        self.queue = Queue(
            "all batches",
            max=2 * coalesce(extract.threads, 1),
            silent=True
        )

        self.bucket = s3.Bucket(self.settings.destination)
        self.notify = aws.Queue(self.settings.notify)
        Thread.run("get records", self.pull_all_remaining)

    def pull_all_remaining(self, please_stop):
        """
        Scan the fact table from the last-known position, grouping id rows into
        batches and pushing them onto self.queue for the extract workers.
        """
        try:
            try:
                content = File(self.settings.extract.last).read_json()
                if len(content) == 1:
                    # FIX: placeholder was corrupted to "{(unknown)}"; the
                    # filename= kwarg shows it must be {{filename}}
                    Log.note(
                        "Got a manually generated file {{filename}}",
                        filename=self.settings.extract.last
                    )
                    start_point = tuple(content[0])
                    first_value = [
                        self._extract.start[0] + (start_point[0] * DAY),
                        start_point[1]
                    ]
                else:
                    Log.note(
                        "Got a machine generated file {{filename}}",
                        filename=self.settings.extract.last
                    )
                    start_point, first_value = content
                    start_point = tuple(start_point)
                Log.note(
                    "First value is {{start1|date}}, {{start2}}",
                    start1=first_value[0],
                    start2=first_value[1]
                )
            except Exception as _:
                Log.error(
                    "Expecting a file {{filename}} with the last good S3 bucket etl id in array form eg: [[954, 0]]",
                    filename=self.settings.extract.last
                )
                start_point = tuple(self._extract.start)
                first_value = Null

            # BUILD THE NESTED COUNTER, INNERMOST DIMENSION FIRST
            # FIX: zip() objects are not reversible on Python 3; materialize first
            counter = Counter(start=0)
            for t, s, b, f, i in reversed(list(zip(
                self._extract.type,
                self._extract.start,
                self._extract.batch,
                listwrap(first_value) + DUMMY_LIST,
                range(len(self._extract.start))
            ))):
                if t == "time":
                    counter = DurationCounter(start=s, duration=b, child=counter)
                    first_value[i] = Date(f)
                else:
                    counter = BatchCounter(start=s, size=b, child=counter)

            batch_size = self._extract.batch.last() * 2 * self.settings.extract.threads
            with MySQL(**self.settings.snowflake.database) as db:
                while not please_stop:
                    sql = self._build_list_sql(db, first_value, batch_size + 1)
                    pending = []
                    counter.reset(start_point)
                    with Timer("Grab a block of ids for processing"):
                        with closing(db.db.cursor()) as cursor:
                            acc = []
                            cursor.execute(sql)
                            count = 0
                            for row in cursor:
                                detail_key = counter.next(row)
                                key = tuple(detail_key[:-1])
                                count += 1
                                if key != start_point:
                                    # CROSSED A BATCH BOUNDARY; FLUSH ACCUMULATED ids
                                    if first_value:
                                        if not acc:
                                            Log.error(
                                                "not expected, {{filename}} is probably set too far in the past",
                                                filename=self.settings.extract.last
                                            )
                                        pending.append({
                                            "start_point": start_point,
                                            "first_value": first_value,
                                            "data": acc
                                        })
                                    acc = []
                                    start_point = key
                                    first_value = row
                                acc.append(row[-1])  # ASSUME LAST COLUMN IS THE FACT TABLE id

                    Log.note("adding {{num}} for processing", num=len(pending))
                    self.queue.extend(pending)

                    if count < batch_size:
                        # SHORT READ MEANS WE REACHED THE END OF THE TABLE
                        self.queue.add(THREAD_STOP)
                        break
        except Exception as e:
            Log.warning("Problem pulling data", cause=e)
        finally:
            self.done_pulling.go()
            Log.note("pulling new data is done")

    def _build_list_sql(self, db, first, batch_size):
        """
        Build the SQL that lists the next `batch_size` key tuples (plus fact id)
        starting strictly after `first`, ordered by the extract key fields.
        """
        # TODO: ENSURE THE LAST COLUMN IS THE id
        if first:
            dim = len(self._extract.field)
            # LEXICOGRAPHIC "GREATER THAN first" OVER THE KEY TUPLE
            where = SQL_OR.join(
                sql_iso(sql_and(
                    quote_column(f) + ineq(i, e, dim) + db.quote_value(Date(v) if t == "time" else v)
                    for e, (f, v, t) in enumerate(zip(
                        self._extract.field[0:i + 1:],
                        first,
                        self._extract.type[0:i + 1:]
                    ))
                ))
                for i in range(dim)
            )
        else:
            where = SQL_TRUE

        selects = []
        for t, f in zip(self._extract.type, self._extract.field):
            if t == "time":
                selects.append(
                    "CAST" + sql_iso(sql_alias(quote_column(f), SQL("DATETIME(6)")))
                )
            else:
                selects.append(quote_column(f))
        sql = (
            SQL_SELECT + sql_list(selects) +
            SQL_FROM + self.settings.snowflake.fact_table +
            SQL_WHERE + where +
            SQL_ORDERBY + sql_list(quote_column(f) for f in self._extract.field) +
            SQL_LIMIT + db.quote_value(batch_size)
        )
        return sql

    def extract(self, db, start_point, first_value, data, please_stop):
        """
        Pull full documents for the given fact-table ids, write them to S3 (or a
        local file when destination is a plain path), then notify SQS and record
        progress. Returns False when writing locally (debug mode).
        """
        Log.note(
            "Starting scan of {{table}} at {{id}} and sending to batch {{start_point}}",
            table=self.settings.snowflake.fact_table,
            id=first_value,
            start_point=start_point
        )

        id = quote_column(self._extract.field.last())
        ids = (
            SQL_SELECT + id +
            SQL_FROM + self.settings.snowflake.fact_table +
            SQL_WHERE + id + " in " + sql_iso(sql_list(map(db.quote_value, data)))
        )
        sql = self.schema.get_sql(ids)

        with Timer("Sending SQL"):
            cursor = db.query(sql, stream=True, row_tuples=True)

        extract = self.settings.extract
        fact_table = self.settings.snowflake.fact_table

        with TempFile() as temp_file:
            # BUILD THE etl LINEAGE, ONE LEVEL PER start_point DIMENSION
            parent_etl = None
            for s in start_point:
                parent_etl = {"id": s, "source": parent_etl}
            parent_etl["revision"] = get_git_revision()
            parent_etl["machine"] = machine_metadata

            def append(value, i):
                """
                :param value: THE DOCUMENT TO ADD
                :param i: DOCUMENT SEQUENCE NUMBER WITHIN THIS BATCH
                """
                temp_file.append(convert.value2json({
                    fact_table: elasticsearch.scrub(value),
                    "etl": {
                        "id": i,
                        "source": parent_etl,
                        "timestamp": Date.now()
                    }
                }))

            with Timer("assemble data"):
                self.construct_docs(cursor, append, please_stop)

            # WRITE TO S3
            s3_file_name = ".".join(map(text_type, start_point))
            # FIX: timer template was corrupted to "{(unknown)}"; param supplies {{filename}}
            with Timer("write to destination {{filename}}", param={"filename": s3_file_name}):
                if not isinstance(self.settings.destination, text_type):
                    destination = self.bucket.get_key(s3_file_name, must_exist=False)
                    destination.write_lines(temp_file)
                else:
                    # LOCAL-FILE DESTINATION: PRETTY-PRINT AND SKIP SQS/PROGRESS
                    destination = File(self.settings.destination)
                    destination.write(convert.value2json(
                        [convert.json2value(o) for o in temp_file],
                        pretty=True
                    ))
                    return False

        # NOTIFY SQS
        now = Date.now()
        self.notify.add({
            "bucket": self.settings.destination.bucket,
            "key": s3_file_name,
            "timestamp": now.unix,
            "date/time": now.format()
        })

        # SUCCESS!!
        File(extract.last).write(convert.value2json([start_point, first_value]))

    def construct_docs(self, cursor, append, please_stop):
        """
        Fold flat SQL rows back into nested documents and hand each completed
        document to `append`.

        :param cursor: ITERATOR OF RECORDS
        :param append: METHOD TO CALL WITH CONSTRUCTED DOCUMENT
        """
        null_values = set(self.settings.snowflake.null_values) | {None}

        count = 0
        rownum = 0
        columns = tuple(wrap(c) for c in self.schema.columns)
        with Timer("Downloading from MySQL"):
            curr_record = Null
            for rownum, row in enumerate(cursor):
                if please_stop:
                    Log.error("Got `please_stop` signal")

                nested_path = []
                next_record = None

                for c, value in zip(columns, row):
                    if value in null_values:
                        continue
                    if len(nested_path) < len(c.nested_path):
                        # DEEPER NESTING: START A FRESH (SUB)RECORD
                        nested_path = unwrap(c.nested_path)
                        next_record = Data()
                    next_record[c.put] = value

                if len(nested_path) > 1:
                    # ATTACH next_record AS A CHILD OF THE CURRENT DOCUMENT
                    path = nested_path[-2]
                    children = curr_record[path]
                    if children == None:
                        children = curr_record[path] = wrap([])
                    if len(nested_path) > 2:
                        # WALK DOWN THE REMAINING NESTED LEVELS
                        parent_path = path
                        for path in list(reversed(nested_path[0:-2:])):
                            parent = children.last()
                            relative_path = relative_field(path, parent_path)
                            children = parent[relative_path]
                            if children == None:
                                children = parent[relative_path] = wrap([])
                            parent_path = path

                    children.append(next_record)
                    continue

                if curr_record == next_record:
                    Log.error("not expected")

                if curr_record:
                    # NOTE(review): the document emitted is curr_record["id"] —
                    # presumably c.put paths root every field under "id"; confirm
                    append(curr_record["id"], count)
                    count += 1
                curr_record = next_record

            # DEAL WITH LAST RECORD
            if curr_record:
                append(curr_record["id"], count)
                count += 1

        Log.note("{{num}} documents ({{rownum}} db records)", num=count, rownum=rownum)
class FromESMetadata(Schema):
    """
    QUERY THE METADATA

    Singleton container for table/column metadata discovered from an
    Elasticsearch cluster; a background thread refreshes cardinality info.

    NOTE(review): a second class with the same name is defined later in this
    file and shadows this one — confirm which definition is intended to survive.
    """

    def __new__(cls, *args, **kwargs):
        # SINGLETON: REUSE THE ONE INSTANCE
        global singlton
        if singlton:
            return singlton
        else:
            singlton = object.__new__(cls)
            return singlton

    @override
    def __init__(self, host, index, alias=None, name=None, port=9200, kwargs=None):
        global _elasticsearch
        if hasattr(self, "settings"):
            # SINGLETON ALREADY INITIALIZED
            return

        from pyLibrary.queries.containers.list_usingPythonList import ListContainer
        from pyLibrary.env import elasticsearch as _elasticsearch

        self.settings = kwargs
        self.default_name = coalesce(name, alias, index)
        self.default_es = _elasticsearch.Cluster(kwargs=kwargs)
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        self.es_metadata = Null
        self.last_es_metadata = Date.now() - OLD_METADATA

        self.meta = Data()
        table_columns = metadata_tables()
        column_columns = metadata_columns()
        self.meta.tables = ListContainer(
            "meta.tables", [], wrap({c.name: c for c in table_columns})
        )
        self.meta.columns = ColumnList()
        self.meta.columns.insert(column_columns)
        self.meta.columns.insert(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("refresh metadata", self.not_monitor)
        return

    @property
    def query_path(self):
        return None

    @property
    def url(self):
        return self.default_es.path + "/" + self.default_name.replace(".", "/")

    def get_table(self, table_name):
        with self.meta.tables.locker:
            return wrap([t for t in self.meta.tables.data if t.name == table_name])

    def _upsert_column(self, c):
        # ASSUMING THE self.meta.columns.locker IS HAD
        existing_columns = self.meta.columns.find(c.table, c.name)
        if not existing_columns:
            self.meta.columns.add(c)
            self.todo.add(c)

            if ENABLE_META_SCAN:
                if DEBUG:
                    Log.note(
                        "todo: {{table}}::{{column}}",
                        table=c.table,
                        column=c.es_column
                    )
                # MARK meta.columns AS DIRTY TOO
                cols = self.meta.columns.find("meta.columns", None)
                for cc in cols:
                    cc.partitions = cc.cardinality = None
                    cc.last_updated = Date.now()
                self.todo.extend(cols)
        else:
            canonical = existing_columns[0]
            if canonical.relative and not c.relative:
                return  # RELATIVE COLUMNS WILL SHADOW ABSOLUTE COLUMNS
            for key in Column.__slots__:
                canonical[key] = c[key]
            if DEBUG:
                Log.note(
                    "todo: {{table}}::{{column}}",
                    table=canonical.table,
                    column=canonical.es_column
                )
            self.todo.add(canonical)

    def _get_columns(self, table=None):
        # TODO: HANDLE MORE THEN ONE ES, MAP TABLE SHORT_NAME TO ES INSTANCE
        meta = self.es_metadata.indices[table]
        if not meta or self.last_es_metadata < Date.now() - OLD_METADATA:
            self.es_metadata = self.default_es.get_metadata(force=True)
            meta = self.es_metadata.indices[table]

        # _id IS NOT IN THE MAPPING; SYNTHESIZE IT
        self._parse_properties(
            meta.index,
            Data(properties={"_id": {"type": "string", "index": "not_analyzed"}}),
            meta
        )
        for _, properties in meta.mappings.items():
            self._parse_properties(meta.index, properties, meta)

    def _parse_properties(self, abs_index, properties, meta):
        abs_columns = _elasticsearch.parse_properties(
            abs_index, None, properties.properties
        )
        abs_columns = abs_columns.filter(
            # TODO: REMOVE WHEN jobs PROPERTY EXPLOSION IS CONTAINED
            lambda r: not r.es_column.startswith("other.") and
                      not r.es_column.startswith("previous_values.cf_") and
                      not r.es_index.startswith("debug") and
                      r.es_column.find("=") == -1 and
                      r.es_column.find(" ") == -1
        )
        with Timer("upserting {{num}} columns", {"num": len(abs_columns)}, debug=DEBUG):
            def add_column(c, query_path):
                c.last_updated = Date.now()
                c.table = join_field([c.es_index] + split_field(query_path[0]))

                with self.meta.columns.locker:
                    self._upsert_column(c)
                    for alias in meta.aliases:
                        c = copy(c)
                        c.table = join_field([alias] + split_field(query_path[0]))
                        self._upsert_column(c)

            # LIST OF EVERY NESTED PATH
            query_paths = [[c.es_column] for c in abs_columns if c.type == "nested"]
            for a, b in itertools.product(query_paths, query_paths):
                aa = a[0]
                bb = b[0]
                if aa and bb.startswith(aa):
                    for i, b_prefix in enumerate(b):
                        if len(b_prefix) > len(aa):
                            continue
                        if aa == b_prefix:
                            break  # SPLIT ALREADY FOUND
                        b.insert(i, aa)
                        break
            for q in query_paths:
                q.append(".")
            query_paths.append(ROOT_PATH)

            # ADD RELATIVE COLUMNS
            for abs_column in abs_columns:
                full_path = abs_column.nested_path
                abs_depth = len(full_path) - 1
                abs_parent = full_path[1] if abs_depth else ""

                for query_path in query_paths:
                    rel_depth = len(query_path) - 1
                    rel_parent = query_path[0]
                    rel_column = copy(abs_column)
                    rel_column.relative = True

                    add_column(copy(abs_column), query_path)

                    if rel_parent == ".":
                        add_column(rel_column, query_path)
                    elif abs_column.es_column.startswith(rel_parent + "."):
                        rel_column.name = abs_column.es_column[len(rel_parent) + 1:]
                        add_column(rel_column, query_path)
                    elif abs_column.es_column == rel_parent:
                        rel_column.name = "."
                        add_column(rel_column, query_path)
                    elif not abs_parent:
                        # THIS RELATIVE NAME (..o) ALSO NEEDS A RELATIVE NAME (o)
                        # AND THEN REMOVE THE SHADOWED
                        rel_column.name = "." + ("." * (rel_depth - abs_depth)) + abs_column.es_column
                        add_column(rel_column, query_path)
                    elif rel_parent.startswith(abs_parent + "."):
                        rel_column.name = "." + ("." * (rel_depth - abs_depth)) + abs_column.es_column
                        add_column(rel_column, query_path)
                    elif rel_parent != abs_parent:
                        # SIBLING NESTED PATHS ARE INVISIBLE
                        pass
                    else:
                        Log.error("logic error")

    def query(self, _query):
        return self.meta.columns.query(QueryOp(set_default(
            {
                "from": self.meta.columns,
                "sort": ["table", "name"]
            },
            _query.__data__()
        )))

    def get_columns(self, table_name, column_name=None, force=False):
        """
        RETURN METADATA COLUMNS
        """
        try:
            # LAST TIME WE GOT INFO FOR THIS TABLE
            short_name = join_field(split_field(table_name)[0:1])
            table = self.get_table(short_name)[0]

            if not table:
                table = Table(
                    name=short_name,
                    url=None,
                    query_path=None,
                    timestamp=Date.now()
                )
                with self.meta.tables.locker:
                    self.meta.tables.add(table)
                self._get_columns(table=short_name)
            elif force or table.timestamp == None or table.timestamp < Date.now() - MAX_COLUMN_METADATA_AGE:
                table.timestamp = Date.now()
                self._get_columns(table=short_name)

            with self.meta.columns.locker:
                columns = self.meta.columns.find(table_name, column_name)
            if columns:
                columns = jx.sort(columns, "name")
                # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
                while len(self.todo) and not all(columns.get("last_updated")):
                    if DEBUG:
                        Log.note(
                            "waiting for columns to update {{columns|json}}",
                            columns=[
                                c.table + "." + c.es_column
                                for c in columns
                                if not c.last_updated
                            ]
                        )
                    Till(seconds=1).wait()
                return columns
        except Exception as e:  # FIX: was Python-2-only `except Exception, e:`
            Log.error("Not expected", cause=e)

        if column_name:
            Log.error(
                "no columns matching {{table}}.{{column}}",
                table=table_name,
                column=column_name
            )
        else:
            self._get_columns(table=table_name)
            Log.error("no columns for {{table}}?!", table=table_name)
class FromESMetadata(Schema): """ QUERY THE METADATA """ def __new__(cls, *args, **kwargs): global singlton if singlton: return singlton else: singlton = object.__new__(cls) return singlton @override def __init__(self, host, index, alias=None, name=None, port=9200, kwargs=None): global _elasticsearch if hasattr(self, "settings"): return from pyLibrary.queries.containers.list_usingPythonList import ListContainer from pyLibrary.env import elasticsearch as _elasticsearch self.settings = kwargs self.default_name = coalesce(name, alias, index) self.default_es = _elasticsearch.Cluster(kwargs=kwargs) self.todo = Queue("refresh metadata", max=100000, unique=True) self.es_metadata = Null self.last_es_metadata = Date.now()-OLD_METADATA self.meta=Data() table_columns = metadata_tables() column_columns = metadata_columns() self.meta.tables = ListContainer("meta.tables", [], wrap({c.names["."]: c for c in table_columns})) self.meta.columns = ColumnList() self.meta.columns.insert(column_columns) self.meta.columns.insert(table_columns) # TODO: fix monitor so it does not bring down ES if ENABLE_META_SCAN: self.worker = Thread.run("refresh metadata", self.monitor) else: self.worker = Thread.run("refresh metadata", self.not_monitor) return @property def query_path(self): return None @property def url(self): return self.default_es.path + "/" + self.default_name.replace(".", "/") def get_table(self, table_name): with self.meta.tables.locker: return wrap([t for t in self.meta.tables.data if t.name == table_name]) def _upsert_column(self, c): # ASSUMING THE self.meta.columns.locker IS HAD existing_columns = self.meta.columns.find(c.es_index, c.names["."]) if not existing_columns: self.meta.columns.add(c) self.todo.add(c) if ENABLE_META_SCAN: if DEBUG: Log.note("todo: {{table}}::{{column}}", table=c.es_index, column=c.es_column) # MARK meta.columns AS DIRTY TOO cols = self.meta.columns.find("meta.columns", None) for cc in cols: cc.partitions = cc.cardinality = None cc.last_updated = 
Date.now() self.todo.extend(cols) else: canonical = existing_columns[0] if canonical is not c: set_default(c.names, canonical.names) for key in Column.__slots__: canonical[key] = c[key] if DEBUG: Log.note("todo: {{table}}::{{column}}", table=canonical.es_index, column=canonical.es_column) self.todo.add(canonical) def _get_columns(self, table=None): # TODO: HANDLE MORE THEN ONE ES, MAP TABLE SHORT_NAME TO ES INSTANCE table_path = split_field(table) es_index = table_path[0] query_path = join_field(table_path[1:]) meta = self.es_metadata.indices[es_index] if not meta or self.last_es_metadata < Date.now() - OLD_METADATA: self.es_metadata = self.default_es.get_metadata(force=True) meta = self.es_metadata.indices[es_index] for _, properties in meta.mappings.items(): properties.properties["_id"] = {"type": "string", "index": "not_analyzed"} self._parse_properties(meta.index, properties, meta) def _parse_properties(self, abs_index, properties, meta): # IT IS IMPORTANT THAT NESTED PROPERTIES NAME ALL COLUMNS, AND # ALL COLUMNS ARE GIVEN NAMES FOR ALL NESTED PROPERTIES abs_columns = _elasticsearch.parse_properties(abs_index, None, properties.properties) abs_columns = abs_columns.filter( # TODO: REMOVE WHEN jobs PROPERTY EXPLOSION IS CONTAINED lambda r: not r.es_column.startswith("other.") and not r.es_column.startswith("previous_values.cf_") and not r.es_index.startswith("debug") and r.es_column.find("=") == -1 and r.es_column.find(" ") == -1 ) def add_column(c, query_path): c.last_updated = Date.now() if query_path[0] != ".": c.names[query_path[0]] = relative_field(c.names["."], query_path[0]) with self.meta.columns.locker: self._upsert_column(c) for alias in meta.aliases: c = copy(c) c.es_index = alias self._upsert_column(c) with Timer("upserting {{num}} columns", {"num": len(abs_columns)}, debug=DEBUG): # LIST OF EVERY NESTED PATH query_paths = [[c.es_column] for c in abs_columns if c.type == "nested"] for a, b in itertools.product(query_paths, query_paths): aa = a[0] bb 
= b[0]  # NOTE(review): fragment — tail of a method whose `def` is outside this chunk.
        # It appears to be the same nested-query-path prefix-splitting loop seen in
        # ElasticsearchMetadata._parse_properties later in this file; `a`, `b`, `aa`,
        # `query_paths`, `abs_columns` and `add_column` are defined in the missing head.
        if aa and bb.startswith(aa):
            for i, b_prefix in enumerate(b):
                if len(b_prefix) > len(aa):
                    continue
                if aa == b_prefix:
                    break  # SPLIT ALREADY FOUND
                b.insert(i, aa)
                break
        for q in query_paths:
            # every query path list is terminated with the root path "."
            q.append(".")
        query_paths.append(ROOT_PATH)

        # ADD RELATIVE COLUMNS
        for abs_column in abs_columns:
            for query_path in query_paths:
                add_column(abs_column, query_path)

    def query(self, _query):
        """
        Run a JX query against the column metadata itself.
        Defaults `from` to the columns container and sorts by (table, name);
        anything in `_query` overrides those defaults via set_default.
        """
        return self.meta.columns.query(QueryOp(set_default(
            {
                "from": self.meta.columns,
                "sort": ["table", "name"]
            },
            _query.__data__()
        )))

    def get_columns(self, table_name, column_name=None, force=False):
        """
        RETURN METADATA COLUMNS

        :param table_name: dotted table path; first segment is the ES index name
        :param column_name: OPTIONAL - restrict to one column
        :param force: True to refresh column metadata even if it is not stale
        Raises (via Log.error) when no matching columns can be found.
        """
        table_path = split_field(table_name)
        es_index_name = table_path[0]
        query_path = join_field(table_path[1:])
        table = self.get_table(es_index_name)[0]
        abs_column_name = None if column_name == None else concat_field(query_path, column_name)

        try:
            # LAST TIME WE GOT INFO FOR THIS TABLE
            if not table:
                # table is unknown: register it and pull its columns
                table = Table(
                    name=es_index_name,
                    url=None,
                    query_path=None,
                    timestamp=Date.now()
                )
                with self.meta.tables.locker:
                    self.meta.tables.add(table)
                self._get_columns(table=es_index_name)
            elif force or table.timestamp == None or table.timestamp < Date.now() - MAX_COLUMN_METADATA_AGE:
                # metadata is stale (or refresh was forced): re-pull columns
                table.timestamp = Date.now()
                self._get_columns(table=es_index_name)

            with self.meta.columns.locker:
                columns = self.meta.columns.find(es_index_name, column_name)
            if columns:
                columns = jx.sort(columns, "names.\.")
                # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
                # (the background monitor drains self.todo and stamps last_updated)
                while len(self.todo) and not all(columns.get("last_updated")):
                    if DEBUG:
                        Log.note("waiting for columns to update {{columns|json}}", columns=[c.es_index+"."+c.es_column for c in columns if not c.last_updated])
                    Till(seconds=1).wait()
                return columns
        except Exception as e:
            Log.error("Not expected", cause=e)

        # no columns found: report the failure as precisely as possible
        if abs_column_name:
            Log.error("no columns matching {{table}}.{{column}}", table=table_name, column=abs_column_name)
        else:
            self._get_columns(table=table_name)  # TO TEST WHAT HAPPENED
            Log.error("no columns for {{table}}?!", table=table_name)

    def _update_cardinality(self, c):
        """
        QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN

        Writes count/cardinality (and partitions, for low-cardinality columns)
        back into self.meta.columns.  The meta.columns/meta.tables pseudo-indexes
        are computed locally; everything else is asked of Elasticsearch.
        """
        if c.type in STRUCT:
            Log.error("not supported")
        try:
            if c.es_index == "meta.columns":
                # local pseudo-table: group the in-memory column list instead of querying ES
                with self.meta.columns.locker:
                    partitions = jx.sort([g[c.es_column] for g, _ in jx.groupby(self.meta.columns, c.es_column) if g[c.es_column] != None])
                    self.meta.columns.update({
                        "set": {
                            "partitions": partitions,
                            "count": len(self.meta.columns),
                            "cardinality": len(partitions),
                            "last_updated": Date.now()
                        },
                        "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                    })
                return
            if c.es_index == "meta.tables":
                # local pseudo-table: same treatment for the table list
                with self.meta.columns.locker:
                    partitions = jx.sort([g[c.es_column] for g, _ in jx.groupby(self.meta.tables, c.es_column) if g[c.es_column] != None])
                    self.meta.columns.update({
                        "set": {
                            "partitions": partitions,
                            "count": len(self.meta.tables),
                            "cardinality": len(partitions),
                            "last_updated": Date.now()
                        },
                        "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                    })
                return

            es_index = c.es_index.split(".")[0]
            # first pass: count documents and estimate cardinality
            # NOTE(review): this agg key uses c.names["."] raw, while the second
            # query below wraps it in literal_field() — looks inconsistent; verify.
            result = self.default_es.post("/" + es_index + "/_search", data={
                "aggs": {c.names["."]: _counting_query(c)},
                "size": 0
            })
            r = result.aggregations.values()[0]
            count = result.hits.total
            cardinality = coalesce(r.value, r._nested.value, 0 if r.doc_count==0 else None)
            if cardinality == None:
                Log.error("logic error")

            query = Data(size=0)
            if cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99):
                # too many distinct values to be useful as partitions
                if DEBUG:
                    Log.note("{{table}}.{{field}} has {{num}} parts", table=c.es_index, field=c.es_column, num=cardinality)
                with self.meta.columns.locker:
                    self.meta.columns.update({
                        "set": {
                            "count": count,
                            "cardinality": cardinality,
                            "last_updated": Date.now()
                        },
                        "clear": ["partitions"],
                        "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                    })
                return
            elif c.type in _elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
                # numeric columns with many values: record stats but no partitions
                if DEBUG:
                    Log.note("{{field}} has {{num}} parts", field=c.name, num=cardinality)
                with self.meta.columns.locker:
                    self.meta.columns.update({
                        "set": {
                            "count": count,
                            "cardinality": cardinality,
                            "last_updated": Date.now()
                        },
                        "clear": ["partitions"],
                        "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                    })
                return
            elif len(c.nested_path) != 1:
                # nested column: terms agg must be wrapped in a nested agg
                query.aggs[literal_field(c.names["."])] = {
                    "nested": {"path": c.nested_path[0]},
                    "aggs": {"_nested": {"terms": {"field": c.es_column, "size": 0}}}
                }
            else:
                query.aggs[literal_field(c.names["."])] = {"terms": {"field": c.es_column, "size": 0}}

            # second pass: fetch the actual partition values
            result = self.default_es.post("/" + es_index + "/_search", data=query)
            aggs = result.aggregations.values()[0]
            if aggs._nested:
                parts = jx.sort(aggs._nested.buckets.key)
            else:
                parts = jx.sort(aggs.buckets.key)

            if DEBUG:
                Log.note("{{field}} has {{parts}}", field=c.name, parts=parts)
            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "partitions": parts,
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                })
        except Exception as e:
            if "IndexMissingException" in e and c.es_index.startswith(TEST_TABLE_PREFIX):
                # test tables are expected to disappear; record zeros instead of failing
                with self.meta.columns.locker:
                    self.meta.columns.update({
                        "set": {
                            "count": 0,
                            "cardinality": 0,
                            "last_updated": Date.now()
                        },
                        "clear":[
                            "partitions"
                        ],
                        "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                    })
            else:
                # NOTE(review): this update is done WITHOUT holding
                # self.meta.columns.locker, unlike every other update here — confirm.
                self.meta.columns.update({
                    "set": {
                        "last_updated": Date.now()
                    },
                    "clear": [
                        "count",
                        "cardinality",
                        "partitions",
                    ],
                    "where": {"eq": {"names.\\.": ".", "es_index": c.es_index, "es_column": c.es_column}}
                })
                Log.warning("Could not get {{col.es_index}}.{{col.es_column}} info", col=c, cause=e)

    def monitor(self, please_stop):
        """
        Background worker: keep pulling columns from self.todo and refresh
        their cardinality/partitions until please_stop is triggered.
        When the queue is empty, re-seed it with stale columns.
        """
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            try:
                if not self.todo:
                    with self.meta.columns.locker:
                        # stale = never updated, or older than TOO_OLD
                        old_columns = [
                            c
                            for c in self.meta.columns
                            if (c.last_updated == None or c.last_updated < Date.now()-TOO_OLD) and c.type not in STRUCT
                        ]
                        if old_columns:
                            if DEBUG:
                                Log.note("Old columns wth dates {{dates|json}}", dates=wrap(old_columns).last_updated)
                            self.todo.extend(old_columns)
                            # TEST CONSISTENCY
                            # (fail hard if the same column got queued twice)
                            for c, d in product(list(self.todo.queue), list(self.todo.queue)):
                                if c.es_column == d.es_column and c.es_index == d.es_index and c != d:
                                    Log.error("")
                        else:
                            if DEBUG:
                                Log.note("no more metatdata to update")

                column = self.todo.pop(Till(seconds=(10*MINUTE).seconds))
                if column:
                    if DEBUG:
                        Log.note("update {{table}}.{{column}}", table=column.es_index, column=column.es_column)
                    if column.type in STRUCT:
                        # struct columns have no cardinality; just stamp them fresh
                        with self.meta.columns.locker:
                            column.last_updated = Date.now()
                        continue
                    elif column.last_updated >= Date.now()-TOO_OLD:
                        continue  # already fresh; skip
                    try:
                        self._update_cardinality(column)
                        if DEBUG and not column.es_index.startswith(TEST_TABLE_PREFIX):
                            Log.note("updated {{column.name}}", column=column)
                    except Exception as e:
                        Log.warning("problem getting cardinality for {{column.name}}", column=column, cause=e)
            except Exception as e:
                Log.warning("problem in cardinality monitor", cause=e)

    def not_monitor(self, please_stop):
        """
        Replacement worker used when the metadata scan is disabled: drain
        self.todo, clearing stats instead of querying ES for them.
        """
        Log.alert("metadata scan has been disabled")
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            c = self.todo.pop()
            if c == THREAD_STOP:
                break
            # NOTE(review): `not c.last_updated` skips never-updated columns too,
            # so those keep empty stats — confirm that is intended.
            if not c.last_updated or c.last_updated >= Date.now()-TOO_OLD:
                continue

            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "last_updated": Date.now()
                    },
                    "clear":[
                        "count",
                        "cardinality",
                        "partitions",
                    ],
                    "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                })
            if DEBUG:
                Log.note("Could not get {{col.es_index}}.{{col.es_column}} info", col=c)
class ElasticsearchMetadata(Namespace):
    """
    MANAGE SNOWFLAKE SCHEMAS FOR EACH OF THE ALIASES FOUND IN THE CLUSTER

    One instance per ES cluster (see __new__); column metadata is refreshed
    by a background worker thread (monitor / not_monitor).
    """

    @override
    def __new__(cls, kwargs, *args, **_kwargs):
        # one namespace per cluster: reuse the instance keyed by cluster identity
        es_cluster = elasticsearch.Cluster(kwargs)
        output = known_clusters.get(id(es_cluster))
        if output is None:
            output = object.__new__(cls)
            known_clusters[id(es_cluster)] = output
        return output

    @override
    def __init__(self, host, index, sql_file='metadata.sqlite', alias=None, name=None, port=9200, kwargs=None):
        if hasattr(self, "settings"):
            # already initialized (singleton per cluster, see __new__)
            return
        self.too_old = TOO_OLD
        self.settings = kwargs
        self.default_name = coalesce(name, alias, index)
        self.es_cluster = elasticsearch.Cluster(kwargs=kwargs)
        self.index_does_not_exist = set()  # indexes known to be missing; skip them
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        self.index_to_alias = {}

        self.es_metadata = Null
        self.metadata_last_updated = Date.now() - OLD_METADATA

        self.meta = Data()
        self.meta.columns = ColumnList(URL(self.es_cluster.settings.host).host)

        self.alias_to_query_paths = {
            "meta.columns": [ROOT_PATH],
            "meta.tables": [ROOT_PATH]
        }
        self.alias_last_updated = {
            "meta.columns": Date.now(),
            "meta.tables": Date.now()
        }
        table_columns = metadata_tables()
        self.meta.tables = ListContainer(
            "meta.tables",
            [],
            jx_base.Schema(".", table_columns)
        )
        self.meta.columns.extend(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("not refresh metadata", self.not_monitor)
        return

    @property
    def namespace(self):
        return self.meta.columns.namespace

    @property
    def url(self):
        return self.es_cluster.url / self.default_name.replace(".", "/")

    def _reload_columns(self, table_desc):
        """
        Reload column metadata for one alias.

        :param table_desc: TableDesc FOR A REAL ALIAS (OR NAME OF INDEX THAT HAS NO ALIAS)
        :return: list of canonical Column objects registered in self.meta.columns
        """
        # FIND ALL INDEXES OF ALIAS
        es_last_updated = self.es_cluster.metatdata_last_updated

        alias = table_desc.name
        canonical_index = self.es_cluster.get_best_matching_index(alias).index
        # NOTE(review): `not (timestamp < es_last_updated)` forces a metadata pull
        # when the local copy is NOT older than the cluster's — looks inverted; confirm.
        es_metadata_update_required = not (table_desc.timestamp < es_last_updated)
        metadata = self.es_cluster.get_metadata(force=es_metadata_update_required)

        props = [
            (self.es_cluster.get_index(index=i, type=t, debug=DEBUG), t, m.properties)
            for i, d in metadata.indices.items()
            if alias in d.aliases
            for t, m in [_get_best_type_from_mapping(d.mappings)]
        ]

        # CONFIRM ALL COLUMNS ARE SAME, FIX IF NOT
        dirty = False
        all_comparisions = list(jx.pairwise(props)) + list(jx.pairwise(jx.reverse(props)))
        # NOTICE THE SAME (index, type, properties) TRIPLE FROM ABOVE
        for (i1, t1, p1), (i2, t2, p2) in all_comparisions:
            diff = elasticsearch.diff_schema(p2, p1)
            if not self.settings.read_only:
                for d in diff:
                    dirty = True
                    i1.add_property(*d)
        meta = self.es_cluster.get_metadata(force=dirty).indices[canonical_index]

        data_type, mapping = _get_best_type_from_mapping(meta.mappings)
        mapping.properties["_id"] = {"type": "string", "index": "not_analyzed"}
        columns = self._parse_properties(alias, mapping)
        table_desc.timestamp = es_last_updated
        return columns

    def _parse_properties(self, alias, mapping):
        """
        Convert an ES mapping into Column metadata, register the columns,
        and queue them for cardinality scanning.

        :param alias: the alias (snowflake name) the mapping belongs to
        :param mapping: ES mapping (with properties) for the canonical index
        :return: list of canonical Column objects
        """
        abs_columns = elasticsearch.parse_properties(alias, ".", ROOT_PATH, mapping.properties)
        if DEBUG and any(c.cardinality == 0 and c.name != '_id' for c in abs_columns):
            Log.warning(
                "Some columns are not stored in {{url}} {{index|quote}} table:\n{{names}}",
                url=self.es_cluster.url,
                index=alias,
                names=[
                    ".".join((c.es_index, c.name))
                    for c in abs_columns
                    if c.cardinality == 0
                ]
            )

        with Timer("upserting {{num}} columns", {"num": len(abs_columns)}, silent=not DEBUG):
            # LIST OF EVERY NESTED PATH
            query_paths = [[c.es_column] for c in abs_columns if c.es_type == "nested"]
            # insert each path that is a prefix of another into the longer path's list
            for a, b in itertools.product(query_paths, query_paths):
                aa = a[0]
                bb = b[0]
                if aa and bb.startswith(aa):
                    for i, b_prefix in enumerate(b):
                        if len(b_prefix) > len(aa):
                            continue
                        if aa == b_prefix:
                            break  # SPLIT ALREADY FOUND
                        b.insert(i, aa)
                        break
            for q in query_paths:
                q.append(".")
            query_paths.append(ROOT_PATH)

            # ENSURE ALL TABLES HAVE THE QUERY PATHS SET
            self.alias_to_query_paths[alias] = query_paths
            for i, a in self.index_to_alias.items():
                if a == alias:
                    self.alias_to_query_paths[i] = query_paths

            # ENSURE COLUMN HAS CORRECT jx_type
            # PICK DEEPEST NESTED PROPERTY AS REPRESENTATIVE
            output = []
            best = {}
            for abs_column in abs_columns:
                abs_column.jx_type = jx_type(abs_column)
                if abs_column.jx_type not in STRUCT:
                    clean_name = unnest_path(abs_column.name)
                    other = best.get(clean_name)
                    if other:
                        if len(other.nested_path) < len(abs_column.nested_path):
                            # deeper property wins; evict the shallower one
                            output.remove(other)
                            self.meta.columns.update({"clear": ".", "where": {"eq": {"es_column": other.es_column, "es_index": other.es_index}}})
                        else:
                            continue
                    best[clean_name] = abs_column
                    output.append(abs_column)

            # REGISTER ALL COLUMNS
            canonicals = []
            for abs_column in output:
                canonical = self.meta.columns.add(abs_column)
                canonicals.append(canonical)

            self.todo.extend(canonicals)
            return canonicals

    def query(self, _query):
        """
        Run a JX query over the column metadata; defaults to sorting by (table, name).
        """
        return self.meta.columns.query(QueryOp(set_default(
            {
                "from": self.meta.columns,
                "sort": ["table", "name"]
            },
            _query.__data__()
        )))

    def _find_alias(self, name):
        """
        Resolve `name` to an alias, refreshing the index->alias map when the
        cluster metadata is newer than what we last saw.  Returns None if unknown.
        """
        if self.metadata_last_updated < self.es_cluster.metatdata_last_updated:
            for a in self.es_cluster.get_aliases():
                self.index_to_alias[a.index] = coalesce(a.alias, a.index)
                self.alias_last_updated.setdefault(a.alias, Date.MIN)
        if name in self.alias_last_updated:
            return name
        else:
            return self.index_to_alias.get(name)

    def get_columns(self, table_name, column_name=None, after=None, timeout=None):
        """
        RETURN METADATA COLUMNS

        :param table_name: TABLE WE WANT COLUMNS FOR
        :param column_name: OPTIONAL NAME, IF INTERESTED IN ONLY ONE COLUMN
        :param after: FORCE LOAD, WAITING FOR last_updated TO BE AFTER THIS TIME
        :param timeout: Signal; True when should give up
        :return: list of columns, sorted by name
        """
        DEBUG and after and Log.note("getting columns for after {{time}}", time=after)
        table_path = split_field(table_name)
        root_table_name = table_path[0]

        alias = self._find_alias(root_table_name)
        if not alias:
            # maybe our alias map is stale; force a metadata refresh and retry once
            self.es_cluster.get_metadata(force=True)
            alias = self._find_alias(root_table_name)
            if not alias:
                Log.error("{{table|quote}} does not exist", table=table_name)

        try:
            table = self.get_table(alias)[0]
            # LAST TIME WE GOT INFO FOR THIS TABLE
            if not table:
                table = TableDesc(
                    name=alias,
                    url=None,
                    query_path=["."],
                    timestamp=Date.MIN
                )
                with self.meta.tables.locker:
                    self.meta.tables.add(table)
                columns = self._reload_columns(table)
                DEBUG and Log.note("columns from reload")
            elif after or table.timestamp < self.es_cluster.metatdata_last_updated:
                columns = self._reload_columns(table)
                DEBUG and Log.note("columns from reload")
            else:
                columns = self.meta.columns.find(alias, column_name)
                DEBUG and Log.note("columns from find()")

            DEBUG and Log.note("columns are {{ids}}", ids=[id(c) for c in columns])

            columns = jx.sort(columns, "name")

            if after is None:
                return columns  # DO NOT WAIT FOR COMPLETE COLUMNS

            # WAIT FOR THE COLUMNS TO UPDATE
            while True:
                pending = [c for c in columns if after >= c.last_updated or (c.cardinality == None and c.jx_type not in STRUCT)]
                if not pending:
                    break
                if timeout:
                    # FIX: was "trying to gets columns timed out"
                    Log.error("trying to get columns timed out")
                if DEBUG:
                    if len(pending) > 10:
                        Log.note("waiting for {{num}} columns to update by {{timestamp}}", num=len(pending), timestamp=after)
                    else:
                        Log.note("waiting for columns to update by {{timestamp}}; {{columns|json}}", timestamp=after, columns=[c.es_index + "." + c.es_column + " id=" + text_type(id(c)) for c in pending])
                Till(seconds=1).wait()
            return columns
        except Exception as e:
            Log.error("Failure to get columns for {{table}}", table=table_name, cause=e)

        return []

    def _update_cardinality(self, column):
        """
        QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN

        Writes count/cardinality/multi (and partitions, when few enough) back
        into self.meta.columns.  meta.columns/meta.tables are computed locally.
        """
        now = Date.now()
        if column.es_index in self.index_does_not_exist:
            return

        if column.jx_type in STRUCT:
            Log.error("not supported")
        try:
            if column.es_index == "meta.columns":
                # local pseudo-table: group the in-memory column list
                partitions = jx.sort([g[column.es_column] for g, _ in jx.groupby(self.meta.columns, column.es_column) if g[column.es_column] != None])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.columns),
                        "cardinality": len(partitions),
                        "multi": 1,
                        "last_updated": now
                    },
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            if column.es_index == "meta.tables":
                # local pseudo-table: same treatment for the table list
                partitions = jx.sort([g[column.es_column] for g, _ in jx.groupby(self.meta.tables, column.es_column) if g[column.es_column] != None])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.tables),
                        "cardinality": len(partitions),
                        "multi": 1,
                        "last_updated": now
                    },
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return

            es_index = column.es_index.split(".")[0]
            is_text = [cc for cc in self.meta.columns if cc.es_column == column.es_column and cc.es_type == "text"]
            if is_text:
                # text IS A MULTIVALUE STRING THAT CAN ONLY BE FILTERED
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "aggs": {
                        "count": {"filter": {"match_all": {}}}
                    },
                    "size": 0
                })
                count = result.hits.total
                cardinality = max(1001, count)
                multi = 1001
            elif column.es_column == "_id":
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "query": {"match_all": {}},
                    "size": 0
                })
                count = cardinality = result.hits.total
                multi = 1
            elif column.es_type == BOOLEAN:
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "aggs": {
                        "count": _counting_query(column)
                    },
                    "size": 0
                })
                count = result.hits.total
                cardinality = 2

                DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
                # NOTE(review): "set" assigns partitions but "clear" also lists
                # partitions — one of the two is likely unintended; confirm which
                # wins in ColumnList.update before changing.
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "partitions": [False, True],
                        "multi": 1,
                        "last_updated": now
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            else:
                # generic column: count distinct values, and measure multivalue-ness
                # over recent documents (or those with no etl timestamp)
                es_query = {
                    "aggs": {
                        "count": _counting_query(column),
                        "_filter": {
                            "aggs": {"multi": {"max": {"script": "doc[" + quote(column.es_column) + "].values.size()"}}},
                            "filter": {"bool": {"should": [
                                {"range": {"etl.timestamp.~n~": {"gte": (Date.today() - WEEK)}}},
                                {"bool": {"must_not": {"exists": {"field": "etl.timestamp.~n~"}}}}
                            ]}}
                        }
                    },
                    "size": 0
                }

                result = self.es_cluster.post("/" + es_index + "/_search", data=es_query)
                agg_results = result.aggregations
                count = result.hits.total
                cardinality = coalesce(agg_results.count.value, agg_results.count._nested.value, agg_results.count.doc_count)
                multi = int(coalesce(agg_results._filter.multi.value, 1))
                if cardinality == None:
                    Log.error("logic error")

            query = Data(size=0)

            if column.es_column == "_id":
                self.meta.columns.update({
                    "set": {
                        "count": cardinality,
                        "cardinality": cardinality,
                        "multi": 1,
                        "last_updated": now
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            elif cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99):
                # too many distinct values to be useful as partitions
                DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "last_updated": now
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            elif column.es_type in elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
                # numeric with many values: record stats but no partitions
                DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "last_updated": now
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            elif len(column.nested_path) != 1:
                # nested column: terms agg must be wrapped in a nested agg
                query.aggs["_"] = {
                    "nested": {"path": column.nested_path[0]},
                    "aggs": {"_nested": {"terms": {"field": column.es_column}}}
                }
            elif cardinality == 0:  # WHEN DOES THIS HAPPEN?
                query.aggs["_"] = {"terms": {"field": column.es_column}}
            else:
                query.aggs["_"] = {"terms": {"field": column.es_column, "size": cardinality}}

            result = self.es_cluster.post("/" + es_index + "/_search", data=query)

            aggs = result.aggregations._
            if aggs._nested:
                parts = jx.sort(aggs._nested.buckets.key)
            else:
                parts = jx.sort(aggs.buckets.key)

            DEBUG and Log.note("update metadata for {{column.es_index}}.{{column.es_column}} (id={{id}}) at {{time}}", id=id(column), column=column, time=now)
            self.meta.columns.update({
                "set": {
                    "count": count,
                    "cardinality": cardinality,
                    "multi": multi,
                    "partitions": parts,
                    "last_updated": now
                },
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
        except Exception as e:
            # CAN NOT IMPORT: THE TEST MODULES SETS UP LOGGING
            # from tests.test_jx import TEST_TABLE
            e = Except.wrap(e)
            TEST_TABLE = "testdata"
            is_missing_index = any(w in e for w in ["IndexMissingException", "index_not_found_exception"])
            is_test_table = column.es_index.startswith((TEST_TABLE_PREFIX, TEST_TABLE))
            if is_missing_index:
                # WE EXPECT TEST TABLES TO DISAPPEAR
                Log.warning("Missing index {{col.es_index}}", col=column, cause=e)
                self.meta.columns.update({
                    "clear": ".",
                    "where": {"eq": {"es_index": column.es_index}}
                })
                self.index_does_not_exist.add(column.es_index)
            elif "No field found for" in e:
                self.meta.columns.update({
                    "clear": ".",
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                Log.warning("Could not get column {{col.es_index}}.{{col.es_column}} info", col=column, cause=e)
            else:
                self.meta.columns.update({
                    "set": {
                        "last_updated": now
                    },
                    "clear": [
                        "count",
                        "cardinality",
                        "multi",
                        "partitions",
                    ],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                Log.warning("Could not get {{col.es_index}}.{{col.es_column}} info", col=column, cause=e)

    def monitor(self, please_stop):
        """
        Background worker: drain self.todo, refreshing cardinality for each
        column; when idle, re-seed the queue with stale columns.
        """
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            try:
                if not self.todo:
                    # stale = older than MAX_COLUMN_METADATA_AGE, or no cardinality yet
                    old_columns = [
                        c
                        for c in self.meta.columns
                        if ((c.last_updated < Date.now() - MAX_COLUMN_METADATA_AGE) or c.cardinality == None) and c.jx_type not in STRUCT
                    ]
                    if old_columns:
                        DEBUG and Log.note(
                            "Old columns {{names|json}} last updated {{dates|json}}",
                            names=wrap(old_columns).es_column,
                            dates=[Date(t).format() for t in wrap(old_columns).last_updated]
                        )
                        self.todo.extend(old_columns)
                    else:
                        DEBUG and Log.note("no more metatdata to update")

                column = self.todo.pop(Till(seconds=(10*MINUTE).seconds))
                if column:
                    if column is THREAD_STOP:
                        continue
                    with Timer("update {{table}}.{{column}}", param={"table": column.es_index, "column": column.es_column}, silent=not DEBUG):
                        if column.es_index in self.index_does_not_exist:
                            DEBUG and Log.note("{{column.es_column}} does not exist", column=column)
                            self.meta.columns.update({
                                "clear": ".",
                                "where": {"eq": {"es_index": column.es_index}}
                            })
                            continue
                        if column.jx_type in STRUCT or split_field(column.es_column)[-1] == EXISTS_TYPE:
                            DEBUG and Log.note("{{column.es_column}} is a struct", column=column)
                            column.last_updated = Date.now()
                            continue
                        elif column.last_updated > Date.now() - TOO_OLD and column.cardinality is not None:
                            # DO NOT UPDATE FRESH COLUMN METADATA
                            DEBUG and Log.note("{{column.es_column}} is still fresh ({{ago}} ago)", column=column, ago=(Date.now()-Date(column.last_updated)).seconds)
                            continue
                        try:
                            self._update_cardinality(column)
                            (DEBUG and not column.es_index.startswith(TEST_TABLE_PREFIX)) and Log.note("updated {{column.name}}", column=column)
                        except Exception as e:
                            if '"status":404' in e:
                                self.meta.columns.update({
                                    "clear": ".",
                                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                                })
                            else:
                                Log.warning("problem getting cardinality for {{column.name}}", column=column, cause=e)
            except Exception as e:
                Log.warning("problem in cardinality monitor", cause=e)

    def not_monitor(self, please_stop):
        """
        Replacement worker used when the metadata scan is disabled: only the
        whitelisted build.type/run.type columns get a real cardinality scan;
        everything else is just stamped fresh.
        """
        Log.alert("metadata scan has been disabled")
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            column = self.todo.pop()
            if column == THREAD_STOP:
                break
            if column.jx_type in STRUCT or split_field(column.es_column)[-1] == EXISTS_TYPE:
                DEBUG and Log.note("{{column.es_column}} is a struct", column=column)
                column.last_updated = Date.now()
                continue
            elif column.last_updated > Date.now() - TOO_OLD and column.cardinality is not None:
                # DO NOT UPDATE FRESH COLUMN METADATA
                DEBUG and Log.note("{{column.es_column}} is still fresh ({{ago}} ago)", column=column, ago=(Date.now()-Date(column.last_updated)).seconds)
                continue

            with Timer("Update {{col.es_index}}.{{col.es_column}}", param={"col": column}, silent=not DEBUG, too_long=0.05):
                if untype_path(column.name) in ["build.type", "run.type"]:
                    try:
                        self._update_cardinality(column)
                    except Exception as e:
                        Log.warning("problem getting cardinality for {{column.name}}", column=column, cause=e)
                else:
                    column.last_updated = Date.now()

    def get_table(self, name):
        """Return the TableDesc(s) with the given name (the columns container for "meta.columns")."""
        if name == "meta.columns":
            return self.meta.columns
        with self.meta.tables.locker:
            return wrap([t for t in self.meta.tables.data if t.name == name])

    def get_snowflake(self, fact_table_name):
        return Snowflake(fact_table_name, self)

    def get_schema(self, name):
        """Return the schema for `name`; deep (dotted) names go through the snowflake."""
        if name == "meta.columns":
            return self.meta.columns.schema
        if name == "meta.tables":
            return self.meta.tables
        root, rest = tail_field(name)
        return self.get_snowflake(root).get_schema(rest)