class Log_usingThread(BaseLog):

    def __init__(self, logger):
        # DELAYED LOAD FOR THREADS MODULE
        from pyLibrary.thread.threads import Queue

        self.queue = Queue("logs", max=10000, silent=True)
        self.logger = logger

        def worker(please_stop):
            while not please_stop:
                Thread.sleep(1)
                logs = self.queue.pop_all()
                for log in logs:
                    if log is Thread.STOP:
                        if DEBUG_LOGGING:
                            sys.stdout.write("Log_usingThread.worker() sees stop, filling rest of queue\n")
                        please_stop.go()
                    else:
                        self.logger.write(**log)

        self.thread = Thread("log thread", worker)
        self.thread.start()

    def write(self, template, params):
        try:
            self.queue.add({"template": template, "params": params})
            return self
        except Exception, e:
            sys.stdout.write("IF YOU SEE THIS, IT IS LIKELY YOU FORGOT TO RUN Log.start() FIRST\n")
            raise e  # OH NO!
def etl_one(settings):
    queue = Queue("temp work queue")
    queue.__setattr__(b"commit", Null)
    queue.__setattr__(b"rollback", Null)

    settings.param.wait_forever = False
    already_in_queue = set()
    for w in settings.workers:
        source = get_container(w.source)
        # source.settings.fast_forward = True
        if id(source) in already_in_queue:
            continue
        try:
            for i in parse_id_argument(settings.args.id):
                data = source.get_key(i)
                if data != None:
                    already_in_queue.add(id(source))
                    queue.add(Dict(
                        bucket=w.source.bucket,
                        key=i
                    ))
        except Exception, e:
            if "Key {{key}} does not exist" in e:
                already_in_queue.add(id(source))
                queue.add(Dict(
                    bucket=w.source.bucket,
                    key=settings.args.id
                ))
            Log.warning("Problem", cause=e)
def worker(please_stop):
    pending = Queue("pending ids", max=BATCH_SIZE * 3, silent=False)

    pending_thread = Thread.run(
        "get pending",
        get_pending,
        source=source,
        since=last_updated,
        pending_bugs=pending,
        please_stop=please_stop
    )
    diff_thread = Thread.run(
        "diff",
        diff,
        source,
        destination,
        pending,
        please_stop=please_stop
    )
    replication_thread = Thread.run(
        "replication",
        replicate,
        source,
        destination,
        pending,
        config.fix,
        please_stop=please_stop
    )
    pending_thread.join()
    diff_thread.join()
    pending.add(Thread.STOP)
    replication_thread.join()
    done.go()
    please_stop.go()
class TextLog_usingElasticSearch(TextLog):

    @use_settings
    def __init__(self, host, index, type="log", max_size=1000, batch_size=100, settings=None):
        """
        settings ARE FOR THE ELASTICSEARCH INDEX
        """
        self.es = Cluster(settings).get_or_create_index(
            schema=convert.json2value(convert.value2json(SCHEMA), leaves=True),
            limit_replicas=True,
            tjson=True,
            settings=settings
        )
        self.batch_size = batch_size
        self.es.add_alias(coalesce(settings.alias, settings.index))
        self.queue = Queue("debug logs to es", max=max_size, silent=True)
        self.es.settings.retry.times = coalesce(self.es.settings.retry.times, 3)
        self.es.settings.retry.sleep = Duration(coalesce(self.es.settings.retry.sleep, MINUTE))
        Thread.run("add debug logs to es", self._insert_loop)

    def write(self, template, params):
        if params.get("template"):
            # DETECTED INNER TEMPLATE, ASSUME TRACE IS ON, SO DO NOT NEED THE OUTER TEMPLATE
            self.queue.add({"value": params})
        else:
            template = strings.limit(template, 2000)
            self.queue.add({"value": {"template": template, "params": params}}, timeout=3 * MINUTE)
        return self

    def _insert_loop(self, please_stop=None):
        bad_count = 0
        while not please_stop:
            try:
                Thread.sleep(seconds=1)
                messages = wrap(self.queue.pop_all())
                if messages:
                    # for m in messages:
                    #     m.value.params = leafer(m.value.params)
                    #     m.value.error = leafer(m.value.error)
                    for g, mm in jx.groupby(messages, size=self.batch_size):
                        self.es.extend(mm)
                    bad_count = 0
            except Exception, e:
                Log.warning("Problem inserting logs into ES", cause=e)
                bad_count += 1
                if bad_count > 5:
                    break
        Log.warning("Given up trying to write debug logs to ES index {{index}}", index=self.es.settings.index)

        # CONTINUE TO DRAIN THIS QUEUE
        while not please_stop:
            try:
                Thread.sleep(seconds=1)
                self.queue.pop_all()
            except Exception, e:
                Log.warning("Should not happen", cause=e)
class Log_usingThreadedStream(BaseLog):
    # stream CAN BE AN OBJECT WITH write() METHOD, OR A STRING
    # WHICH WILL eval() TO ONE
    def __init__(self, stream):
        assert stream

        use_UTF8 = False

        if isinstance(stream, basestring):
            if stream.startswith("sys."):
                use_UTF8 = True  # sys.* ARE OLD AND CAN NOT HANDLE unicode
            self.stream = eval(stream)
            name = stream
        else:
            self.stream = stream
            name = "stream"

        # WRITE TO STREAMS CAN BE *REALLY* SLOW, WE WILL USE A THREAD
        from pyLibrary.thread.threads import Queue

        if use_UTF8:
            def utf8_appender(value):
                if isinstance(value, unicode):
                    value = value.encode('utf8')
                self.stream.write(value)

            appender = utf8_appender
        else:
            appender = self.stream.write

        self.queue = Queue("log to stream", max=10000, silent=True)
        self.thread = Thread("log to " + name, time_delta_pusher, appender=appender, queue=self.queue, interval=timedelta(seconds=0.3))
        self.thread.parent.remove_child(self.thread)  # LOGGING WILL BE RESPONSIBLE FOR THREAD stop()
        self.thread.start()

    def write(self, template, params):
        try:
            self.queue.add({"template": template, "params": params})
            return self
        except Exception, e:
            raise e  # OH NO!
class Multithread(object):
    """
    SIMPLE SEMANTICS FOR SYMMETRIC MULTITHREADING
    PASS A SET OF functions TO BE EXECUTED (ONE PER THREAD)
    SET outbound==False TO SIMPLY THROW AWAY RETURN VALUES, IF ANY
    threads - IF functions IS NOT AN ARRAY, THEN threads IS USED TO MAKE AN ARRAY
    THE inbound QUEUE IS EXPECTING dicts, EACH dict IS USED AS kwargs TO GIVEN functions
    """

    def __init__(self, functions, threads=None, outbound=None, silent_queues=None):
        if outbound is None:
            self.outbound = Queue("multithread", silent=silent_queues)
        elif outbound is False:
            self.outbound = None
        else:
            self.outbound = outbound

        self.inbound = Queue("multithread", silent=silent_queues)

        # MAKE THREADS
        if isinstance(functions, Iterable):
            Log.error("Not supported anymore")

        self.threads = []
        for t in range(coalesce(threads, 1)):
            thread = worker_thread("worker " + unicode(t), self.inbound, self.outbound, functions)
            self.threads.append(thread)

    def __enter__(self):
        return self

    # WAIT FOR ALL QUEUED WORK TO BE DONE BEFORE RETURNING
    def __exit__(self, type, value, traceback):
        try:
            if isinstance(value, Exception):
                self.inbound.close()
                for t in self.threads:
                    t.keep_running = False
            else:
                # ADD STOP MESSAGE, ONE FOR EACH THREAD, FOR ORDERLY SHUTDOWN
                for t in self.threads:
                    self.inbound.add(Thread.STOP)
                self.join()
        except Exception, e:
            Log.warning("Problem sending stops", e)
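A minimal usage sketch for the class above, following its docstring; the scale() worker and its "value" kwarg are illustrative only and not taken from the source:

    def scale(value):
        return value * 2

    # ONE COPY OF scale() RUNS IN EACH OF THE 4 WORKER THREADS
    with Multithread(scale, threads=4) as multi:
        for v in range(10):
            multi.inbound.add({"value": v})   # EACH dict IS USED AS kwargs FOR scale()
    # ON EXIT, ONE Thread.STOP PER WORKER IS QUEUED AND THE WORKERS ARE JOINED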
class Log_usingQueue(BaseLog):

    def __init__(self):
        self.queue = Queue("log messages")

    def write(self, template, params):
        self.queue.add(expand_template(template, params))

    def stop(self):
        self.queue.close()

    def pop(self):
        lines = self.queue.pop()
        output = []
        for l in lines.split("\n"):
            if l[19:22] == " - ":
                l = l[22:]
            if l.strip().startswith("File"):
                continue
            output.append(l)
        return "\n".join(output).strip()
class TextLog_usingThreadedStream(TextLog):
    # stream CAN BE AN OBJECT WITH write() METHOD, OR A STRING
    # WHICH WILL eval() TO ONE
    def __init__(self, stream):
        assert stream

        use_UTF8 = False

        if isinstance(stream, basestring):
            if stream.startswith("sys."):
                use_UTF8 = True  # sys.* ARE OLD AND CAN NOT HANDLE unicode
            self.stream = eval(stream)
            name = stream
        else:
            self.stream = stream
            name = "stream"

        # WRITE TO STREAMS CAN BE *REALLY* SLOW, WE WILL USE A THREAD
        from pyLibrary.thread.threads import Queue

        if use_UTF8:
            def utf8_appender(value):
                if isinstance(value, unicode):
                    value = value.encode('utf8')
                self.stream.write(value)

            appender = utf8_appender
        else:
            appender = self.stream.write

        self.queue = Queue("log to stream", max=10000, silent=True)
        self.thread = Thread("log to " + name, time_delta_pusher, appender=appender, queue=self.queue, interval=timedelta(seconds=0.3))
        self.thread.parent.remove_child(self.thread)  # LOGGING WILL BE RESPONSIBLE FOR THREAD stop()
        self.thread.start()

    def write(self, template, params):
        try:
            self.queue.add({"template": template, "params": params})
            return self
        except Exception, e:
            raise e  # OH NO!
def find_changeset(self, revision, please_stop=False):
    locker = Lock()
    output = []
    queue = Queue("branches", max=2000)
    queue.extend(self.branches)
    queue.add(Thread.STOP)
    problems = []

    def _find(please_stop):
        for b in queue:
            if please_stop:
                return
            try:
                url = b.url + "json-info?node=" + revision
                response = http.get(url, timeout=30)
                if response.status_code == 200:
                    with locker:
                        output.append(b)
                    Log.note("{{revision}} found at {{url}}", url=url, revision=revision)
            except Exception, f:
                problems.append(f)
class TextLog_usingQueue(TextLog):

    def __init__(self, name=None):
        queue_name = "log messages to queue"
        if name:
            queue_name += " " + name
        self.queue = Queue(queue_name)

    def write(self, template, params):
        self.queue.add(expand_template(template, params))

    def stop(self):
        self.queue.close()

    def pop(self):
        lines = self.queue.pop()
        output = []
        for l in lines.split("\n"):
            if l[19:22] == " - ":
                l = l[22:]
            if l.strip().startswith("File"):
                continue
            output.append(l)
        return "\n".join(output).strip()
class Log_usingThread(BaseLog):

    def __init__(self, logger):
        # DELAYED LOAD FOR THREADS MODULE
        from pyLibrary.thread.threads import Queue

        self.queue = Queue("logs", max=10000, silent=True)
        self.logger = logger

        def worker(please_stop):
            while not please_stop:
                Thread.sleep(1)
                logs = self.queue.pop_all()
                for log in logs:
                    if log is Thread.STOP:
                        if DEBUG_LOGGING:
                            sys.stdout.write("Log_usingThread.worker() sees stop, filling rest of queue\n")
                        please_stop.go()
                    else:
                        self.logger.write(**log)

        self.thread = Thread("log thread", worker)
        self.thread.parent.remove_child(self.thread)  # LOGGING WILL BE RESPONSIBLE FOR THREAD stop()
        self.thread.start()

    def write(self, template, params):
        try:
            self.queue.add({"template": template, "params": params})
            return self
        except Exception, e:
            sys.stdout.write("IF YOU SEE THIS, IT IS LIKELY YOU FORGOT TO RUN Log.start() FIRST\n")
            raise e  # OH NO!
class Sqlite(DB):
    """
    Allows multi-threaded access
    Loads extension functions (like SQRT)
    """

    canonical = None

    def __init__(self, filename=None, db=None):
        """
        :param db: Optional, wrap a sqlite db in a thread
        :return: Multithread safe database
        """
        if not _upgraded:
            _upgrade()

        self.filename = filename
        self.db = db
        self.queue = Queue("sql commands")  # HOLD (command, result, signal) PAIRS
        self.worker = Thread.run("sqlite db thread", self._worker)
        self.get_trace = DEBUG

    def execute(self, command):
        """
        COMMANDS WILL BE EXECUTED IN THE ORDER THEY ARE GIVEN
        BUT CAN INTERLEAVE WITH OTHER THREAD COMMANDS
        :param command: COMMAND FOR SQLITE
        :return: None
        """
        if self.get_trace:
            trace = extract_stack(1)
        else:
            trace = None
        self.queue.add((command, None, None, trace))

    def query(self, command):
        """
        WILL BLOCK CALLING THREAD UNTIL THE command IS COMPLETED
        :param command: COMMAND FOR SQLITE
        :return: list OF RESULTS
        """
        signal = Signal()
        result = Data()
        self.queue.add((command, result, signal, None))
        signal.wait()
        if result.exception:
            Log.error("Problem with Sqlite call", cause=result.exception)
        return result

    def _worker(self, please_stop):
        if Sqlite.canonical:
            self.db = Sqlite.canonical
        else:
            self.db = sqlite3.connect(coalesce(self.filename, ':memory:'))
        try:
            full_path = File("pyLibrary/vendor/sqlite/libsqlitefunctions.so").abspath
            # self.db.execute("SELECT sqlite3_enable_load_extension(1)")
            self.db.enable_load_extension(True)
            self.db.execute("SELECT load_extension('" + full_path + "')")
        except Exception, e:
            Log.warning("loading sqlite extension functions failed, doing without. (no SQRT for you!)", cause=e)

        try:
            while not please_stop:
                if DEBUG:
                    Log.note("begin pop")
                command, result, signal, trace = self.queue.pop(till=please_stop)
                if DEBUG:
                    Log.note("done pop")
                if DEBUG:
                    Log.note("Running command\n{{command|indent}}", command=command)
                with Timer("Run command", debug=DEBUG):
                    if signal is not None:
                        try:
                            curr = self.db.execute(command)
                            result.meta.format = "table"
                            result.header = [d[0] for d in curr.description] if curr.description else None
                            result.data = curr.fetchall()
                            if DEBUG and result.data:
                                text = convert.table2csv(list(result.data))
                                Log.note("Result:\n{{data}}", data=text)
                        except Exception, e:
                            e = Except.wrap(e)
                            result.exception = Except(ERROR, "Problem with\n{{command|indent}}", command=command, cause=e)
                        finally:
                            signal.go()  # ORIGINAL SNIPPET WAS CUT OFF AFTER "finally:"; RESTORED FROM THE OTHER Sqlite VERSION IN THIS COLLECTION
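A minimal usage sketch for the wrapper above, assuming the module's Log and the rest of pyLibrary are importable; the query is illustrative only:

    db = Sqlite()                                   # no filename, so a private in-memory database in its own thread
    result = db.query("SELECT sqlite_version()")    # blocks the caller until the db thread answers
    Log.note("version={{v}}", v=result.data[0][0])  # result.data holds the fetchall() rows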
class FromESMetadata(Schema): """ QUERY THE METADATA """ def __new__(cls, *args, **kwargs): global singlton if singlton: return singlton else: singlton = object.__new__(cls) return singlton @use_settings def __init__(self, host, index, alias=None, name=None, port=9200, settings=None): global _elasticsearch if hasattr(self, "settings"): return from pyLibrary.queries.containers.lists import ListContainer from pyLibrary.env import elasticsearch as _elasticsearch self.settings = settings self.default_name = coalesce(name, alias, index) self.default_es = _elasticsearch.Cluster(settings=settings) self.todo = Queue("refresh metadata", max=100000, unique=True) self.meta=Dict() table_columns = metadata_tables() column_columns = metadata_columns() self.meta.tables = ListContainer("meta.tables", [], wrap({c.name: c for c in table_columns})) self.meta.columns = ListContainer("meta.columns", [], wrap({c.name: c for c in column_columns})) self.meta.columns.insert(column_columns) self.meta.columns.insert(table_columns) # TODO: fix monitor so it does not bring down ES if ENABLE_META_SCAN: self.worker = Thread.run("refresh metadata", self.monitor) else: self.worker = Thread.run("refresh metadata", self.not_monitor) return @property def query_path(self): return None @property def url(self): return self.default_es.path + "/" + self.default_name.replace(".", "/") def get_table(self, table_name): with self.meta.tables.locker: return self.meta.tables.query({"where": {"eq": {"name": table_name}}}) def _upsert_column(self, c): # ASSUMING THE self.meta.columns.locker IS HAD existing_columns = [r for r in self.meta.columns.data if r.table == c.table and r.name == c.name] if not existing_columns: self.meta.columns.add(c) Log.note("todo: {{table}}.{{column}}", table=c.table, column=c.es_column) self.todo.add(c) # MARK meta.columns AS DIRTY TOO cols = [r for r in self.meta.columns.data if r.table == "meta.columns"] for cc in cols: cc.partitions = cc.cardinality = None cc.last_updated = Date.now() self.todo.extend(cols) else: canonical = existing_columns[0] if canonical.relative and not c.relative: return # RELATIVE COLUMNS WILL SHADOW ABSOLUTE COLUMNS for key in Column.__slots__: canonical[key] = c[key] Log.note("todo: {{table}}.{{column}}", table=canonical.table, column=canonical.es_column) self.todo.add(canonical) def _get_columns(self, table=None, metadata=None): # TODO: HANDLE MORE THEN ONE ES, MAP TABLE SHORT_NAME TO ES INSTANCE if not metadata: metadata = self.default_es.get_metadata(force=True) def parse_all(please_stop): for abs_index, meta in jx.sort(metadata.indices.items(), {"value": 0, "sort": -1}): if meta.index != abs_index: continue for _, properties in meta.mappings.items(): if please_stop: return self._parse_properties(abs_index, properties, meta) if table: for abs_index, meta in jx.sort(metadata.indices.items(), {"value": 0, "sort": -1}): if table == meta.index: for _, properties in meta.mappings.items(): self._parse_properties(abs_index, properties, meta) return if table == abs_index: self._get_columns(table=meta.index, metadata=metadata) return else: self.parser = Thread.run("parse properties", parse_all) def _parse_properties(self, abs_index, properties, meta): abs_columns = _elasticsearch.parse_properties(abs_index, None, properties.properties) abs_columns = abs_columns.filter( # TODO: REMOVE WHEN jobs PROPERTY EXPLOSION IS CONTAINED lambda r: not r.es_column.startswith("other.") and not r.es_column.startswith("previous_values.cf_") and not r.es_index.startswith("debug") ) with Timer("upserting 
{{num}} columns", {"num": len(abs_columns)}, debug=DEBUG): def add_column(c, query_path): c.last_updated = Date.now() if query_path: c.table = c.es_index + "." + query_path.last() else: c.table = c.es_index with self.meta.columns.locker: self._upsert_column(c) for alias in meta.aliases: c = copy(c) if query_path: c.table = alias + "." + query_path.last() else: c.table = alias self._upsert_column(c) # EACH query_path IS A LIST OF EVER-INCREASING PATHS THROUGH EACH NESTED LEVEL query_paths = wrap([[c.es_column] for c in abs_columns if c.type == "nested"]) for a, b in itertools.product(query_paths, query_paths): aa = a.last() bb = b.last() if aa and bb.startswith(aa): for i, b_prefix in enumerate(b): if len(b_prefix) < len(aa): continue if aa == b_prefix: break # SPLIT ALREADY FOUND b.insert(0, aa) break query_paths.append([]) for c in abs_columns: # ADD RELATIVE COLUMNS full_path = listwrap(c.nested_path) abs_depth = len(full_path) abs_parent = coalesce(full_path.last(), "") for query_path in query_paths: rel_depth = len(query_path) # ABSOLUTE add_column(copy(c), query_path) cc = copy(c) cc.relative = True if not query_path: add_column(cc, query_path) continue rel_parent = query_path.last() if c.es_column.startswith(rel_parent+"."): cc.name = c.es_column[len(rel_parent)+1:] add_column(cc, query_path) elif c.es_column == rel_parent: cc.name = "." add_column(cc, query_path) elif not abs_parent: # THIS RELATIVE NAME (..o) ALSO NEEDS A RELATIVE NAME (o) # AND THEN REMOVE THE SHADOWED cc.name = "." + ("." * (rel_depth - abs_depth)) + c.es_column add_column(cc, query_path) elif rel_parent.startswith(abs_parent+"."): cc.name = "." + ("." * (rel_depth - abs_depth)) + c.es_column add_column(cc, query_path) elif rel_parent != abs_parent: # SIBLING NESTED PATHS ARE INVISIBLE pass else: Log.error("logic error") def query(self, _query): return self.meta.columns.query(QueryOp(set_default( { "from": self.meta.columns, "sort": ["table", "name"] }, _query.as_dict() ))) def get_columns(self, table_name, column_name=None, fail_when_not_found=False): """ RETURN METADATA COLUMNS """ try: with self.meta.columns.locker: columns = [c for c in self.meta.columns.data if c.table == table_name and (column_name is None or c.name==column_name)] if columns: columns = jx.sort(columns, "name") if fail_when_not_found: # AT LEAST WAIT FOR THE COLUMNS TO UPDATE while len(self.todo) and not all(columns.get("last_updated")): Log.note("waiting for columns to update {{columns|json}}", columns=[c.table+"."+c.es_column for c in columns if not c.last_updated]) Thread.sleep(seconds=1) return columns elif all(columns.get("last_updated")): return columns except Exception, e: Log.error("Not expected", cause=e) if fail_when_not_found: if column_name: Log.error("no columns matching {{table}}.{{column}}", table=table_name, column=column_name) else: self._get_columns(table=table_name) Log.error("no columns for {{table}}", table=table_name) self._get_columns(table=join_field(split_field(table_name)[0:1])) return self.get_columns(table_name=table_name, column_name=column_name, fail_when_not_found=True)
class FromESMetadata(Schema): """ QUERY THE METADATA """ def __new__(cls, *args, **kwargs): global singlton if singlton: return singlton else: singlton = object.__new__(cls) return singlton @use_settings def __init__(self, host, index, alias=None, name=None, port=9200, settings=None): global _elasticsearch if hasattr(self, "settings"): return from pyLibrary.queries.containers.list_usingPythonList import ListContainer from pyLibrary.env import elasticsearch as _elasticsearch self.settings = settings self.default_name = coalesce(name, alias, index) self.default_es = _elasticsearch.Cluster(settings=settings) self.todo = Queue("refresh metadata", max=100000, unique=True) self.es_metadata = Null self.last_es_metadata = Date.now()-OLD_METADATA self.meta=Dict() table_columns = metadata_tables() column_columns = metadata_columns() self.meta.tables = ListContainer("meta.tables", [], wrap({c.name: c for c in table_columns})) self.meta.columns = ColumnList() self.meta.columns.insert(column_columns) self.meta.columns.insert(table_columns) # TODO: fix monitor so it does not bring down ES if ENABLE_META_SCAN: self.worker = Thread.run("refresh metadata", self.monitor) else: self.worker = Thread.run("refresh metadata", self.not_monitor) return @property def query_path(self): return None @property def url(self): return self.default_es.path + "/" + self.default_name.replace(".", "/") def get_table(self, table_name): with self.meta.tables.locker: return wrap([t for t in self.meta.tables.data if t.name == table_name]) def _upsert_column(self, c): # ASSUMING THE self.meta.columns.locker IS HAD existing_columns = self.meta.columns.find(c.table, c.name) if not existing_columns: self.meta.columns.add(c) self.todo.add(c) if ENABLE_META_SCAN: Log.note("todo: {{table}}::{{column}}", table=c.table, column=c.es_column) # MARK meta.columns AS DIRTY TOO cols = self.meta.columns.find("meta.columns", None) for cc in cols: cc.partitions = cc.cardinality = None cc.last_updated = Date.now() self.todo.extend(cols) else: canonical = existing_columns[0] if canonical.relative and not c.relative: return # RELATIVE COLUMNS WILL SHADOW ABSOLUTE COLUMNS for key in Column.__slots__: canonical[key] = c[key] Log.note("todo: {{table}}::{{column}}", table=canonical.table, column=canonical.es_column) self.todo.add(canonical) def _get_columns(self, table=None): # TODO: HANDLE MORE THEN ONE ES, MAP TABLE SHORT_NAME TO ES INSTANCE meta = self.es_metadata.indices[table] if not meta or self.last_es_metadata < Date.now() - OLD_METADATA: self.es_metadata = self.default_es.get_metadata(force=True) meta = self.es_metadata.indices[table] for _, properties in meta.mappings.items(): self._parse_properties(meta.index, properties, meta) def _parse_properties(self, abs_index, properties, meta): abs_columns = _elasticsearch.parse_properties(abs_index, None, properties.properties) abs_columns = abs_columns.filter( # TODO: REMOVE WHEN jobs PROPERTY EXPLOSION IS CONTAINED lambda r: not r.es_column.startswith("other.") and not r.es_column.startswith("previous_values.cf_") and not r.es_index.startswith("debug") and r.es_column.find("=")==-1 and r.es_column.find(" ")==-1 ) with Timer("upserting {{num}} columns", {"num": len(abs_columns)}, debug=DEBUG): def add_column(c, query_path): c.last_updated = Date.now() c.table = join_field([c.es_index]+split_field(query_path[0])) with self.meta.columns.locker: self._upsert_column(c) for alias in meta.aliases: c = copy(c) c.table = join_field([alias]+split_field(query_path[0])) self._upsert_column(c) # LIST OF EVERY 
NESTED PATH query_paths = [[c.es_column] for c in abs_columns if c.type == "nested"] for a, b in itertools.product(query_paths, query_paths): aa = a[0] bb = b[0] if aa and bb.startswith(aa): for i, b_prefix in enumerate(b): if len(b_prefix) > len(aa): continue if aa == b_prefix: break # SPLIT ALREADY FOUND b.insert(i, aa) break for q in query_paths: q.append(".") query_paths.append(ROOT_PATH) # ADD RELATIVE COLUMNS for abs_column in abs_columns: full_path = abs_column.nested_path abs_depth = len(full_path)-1 abs_parent = full_path[1] if abs_depth else "" for query_path in query_paths: rel_depth = len(query_path)-1 rel_parent = query_path[0] rel_column = copy(abs_column) rel_column.relative = True add_column(copy(abs_column), query_path) if rel_parent == ".": add_column(rel_column, query_path) elif abs_column.es_column.startswith(rel_parent+"."): rel_column.name = abs_column.es_column[len(rel_parent)+1:] add_column(rel_column, query_path) elif abs_column.es_column == rel_parent: rel_column.name = "." add_column(rel_column, query_path) elif not abs_parent: # THIS RELATIVE NAME (..o) ALSO NEEDS A RELATIVE NAME (o) # AND THEN REMOVE THE SHADOWED rel_column.name = "." + ("." * (rel_depth - abs_depth)) + abs_column.es_column add_column(rel_column, query_path) elif rel_parent.startswith(abs_parent+"."): rel_column.name = "." + ("." * (rel_depth - abs_depth)) + abs_column.es_column add_column(rel_column, query_path) elif rel_parent != abs_parent: # SIBLING NESTED PATHS ARE INVISIBLE pass else: Log.error("logic error") def query(self, _query): return self.meta.columns.query(QueryOp(set_default( { "from": self.meta.columns, "sort": ["table", "name"] }, _query.as_dict() ))) def get_columns(self, table_name, column_name=None, force=False): """ RETURN METADATA COLUMNS """ try: # LAST TIME WE GOT INFO FOR THIS TABLE short_name = join_field(split_field(table_name)[0:1]) table = self.get_table(short_name)[0] if not table: table = Table( name=short_name, url=None, query_path=None, timestamp=Date.now() ) with self.meta.tables.locker: self.meta.tables.add(table) self._get_columns(table=short_name) elif force or table.timestamp == None or table.timestamp < Date.now() - MAX_COLUMN_METADATA_AGE: table.timestamp = Date.now() self._get_columns(table=short_name) with self.meta.columns.locker: columns = self.meta.columns.find(table_name, column_name) if columns: columns = jx.sort(columns, "name") # AT LEAST WAIT FOR THE COLUMNS TO UPDATE while len(self.todo) and not all(columns.get("last_updated")): Log.note("waiting for columns to update {{columns|json}}", columns=[c.table+"."+c.es_column for c in columns if not c.last_updated]) Thread.sleep(seconds=1) return columns except Exception, e: Log.error("Not expected", cause=e) if column_name: Log.error("no columns matching {{table}}.{{column}}", table=table_name, column=column_name) else: self._get_columns(table=table_name) Log.error("no columns for {{table}}?!", table=table_name)
class Sqlite(object):

    canonical = None

    def __init__(self, db=None):
        """
        :param db: Optional, wrap a sqlite db in a thread
        :return: Multithread safe database
        """
        self.db = None
        self.queue = Queue("sql commands")  # HOLD (command, result, signal) PAIRS
        self.worker = Thread.run("sqlite db thread", self._worker)
        self.get_trace = DEBUG

    def execute(self, command):
        """
        COMMANDS WILL BE EXECUTED IN THE ORDER THEY ARE GIVEN
        BUT CAN INTERLEAVE WITH OTHER THREAD COMMANDS
        :param command: COMMAND FOR SQLITE
        :return: None
        """
        if self.get_trace:
            trace = extract_stack(1)
        else:
            trace = None
        self.queue.add((command, None, None, trace))

    def query(self, command):
        """
        WILL STALL CALLING THREAD UNTIL THE command IS COMPLETED
        :param command: COMMAND FOR SQLITE
        :return: list OF RESULTS
        """
        signal = Signal()
        result = Dict()
        self.queue.add((command, result, signal, None))
        signal.wait_for_go()
        if result.exception:
            Log.error("Problem with Sqlite call", cause=result.exception)
        return result

    def _worker(self, please_stop):
        if Sqlite.canonical:
            self.db = Sqlite.canonical
        else:
            self.db = sqlite3.connect(':memory:')
        try:
            while not please_stop:
                if DEBUG:
                    Log.note("begin pop")
                command, result, signal, trace = self.queue.pop()
                if DEBUG:
                    Log.note("done pop")
                if DEBUG:
                    Log.note("Running command\n{{command|indent}}", command=command)
                with Timer("Run command", debug=DEBUG):
                    if signal is not None:
                        try:
                            curr = self.db.execute(command)
                            result.meta.format = "table"
                            result.data = curr.fetchall()
                        except Exception, e:
                            e = Except.wrap(e)
                            result.exception = Except(ERROR, "Problem with\n{{command|indent}}", command=command, cause=e)
                        finally:
                            signal.go()
class Sqlite(object):
    """
    Allows multi-threaded access
    Loads extension functions (like SQRT)
    """

    canonical = None

    def __init__(self, db=None):
        """
        :param db: Optional, wrap a sqlite db in a thread
        :return: Multithread safe database
        """
        if not _upgraded:
            _upgrade()

        self.db = None
        self.queue = Queue("sql commands")  # HOLD (command, result, signal) PAIRS
        self.worker = Thread.run("sqlite db thread", self._worker)
        self.get_trace = DEBUG

    def execute(self, command):
        """
        COMMANDS WILL BE EXECUTED IN THE ORDER THEY ARE GIVEN
        BUT CAN INTERLEAVE WITH OTHER THREAD COMMANDS
        :param command: COMMAND FOR SQLITE
        :return: None
        """
        if self.get_trace:
            trace = extract_stack(1)
        else:
            trace = None
        self.queue.add((command, None, None, trace))

    def query(self, command):
        """
        WILL BLOCK CALLING THREAD UNTIL THE command IS COMPLETED
        :param command: COMMAND FOR SQLITE
        :return: list OF RESULTS
        """
        signal = Signal()
        result = Dict()
        self.queue.add((command, result, signal, None))
        signal.wait_for_go()
        if result.exception:
            Log.error("Problem with Sqlite call", cause=result.exception)
        return result

    def _worker(self, please_stop):
        if Sqlite.canonical:
            self.db = Sqlite.canonical
        else:
            self.db = sqlite3.connect(':memory:')
        try:
            full_path = File("pyLibrary/vendor/sqlite/libsqlitefunctions.so").abspath
            # self.db.execute("SELECT sqlite3_enable_load_extension(1)")
            self.db.enable_load_extension(True)
            self.db.execute("SELECT load_extension('" + full_path + "')")
        except Exception, e:
            Log.warning("loading sqlite extension functions failed, doing without. (no SQRT for you!)", cause=e)

        try:
            while not please_stop:
                if DEBUG:
                    Log.note("begin pop")
                command, result, signal, trace = self.queue.pop()
                if DEBUG:
                    Log.note("done pop")
                if DEBUG:
                    Log.note("Running command\n{{command|indent}}", command=command)
                with Timer("Run command", debug=DEBUG):
                    if signal is not None:
                        try:
                            curr = self.db.execute(command)
                            result.meta.format = "table"
                            result.header = [d[0] for d in curr.description] if curr.description else None
                            result.data = curr.fetchall()
                        except Exception, e:
                            e = Except.wrap(e)
                            result.exception = Except(ERROR, "Problem with\n{{command|indent}}", command=command, cause=e)
                        finally:
                            signal.go()  # ORIGINAL SNIPPET WAS CUT OFF AFTER "finally:"; RESTORED FROM THE OTHER Sqlite VERSION IN THIS COLLECTION
def loop(source, coverage_summary_index, settings, please_stop):
    try:
        cluster = elasticsearch.Cluster(source)
        aliases = cluster.get_aliases()
        candidates = []
        for pairs in aliases:
            if pairs.alias == source.index:
                candidates.append(pairs.index)
        candidates = jx.sort(candidates, {".": "desc"})

        for index_name in candidates:
            coverage_index = elasticsearch.Index(index=index_name, read_only=False, settings=source)
            push_date_filter = unicode2Date(coverage_index.settings.index[-15::], elasticsearch.INDEX_DATE_FORMAT)

            while not please_stop:
                # IDENTIFY NEW WORK
                Log.note("Working on index {{index}}", index=index_name)
                coverage_index.refresh()

                todo = http.post_json(settings.url, json={
                    "from": "coverage",
                    "groupby": ["source.file.name", "build.revision12"],
                    "where": {"and": [
                        {"missing": "source.method.name"},
                        {"missing": "source.file.min_line_siblings"},
                        {"gte": {"repo.push.date": push_date_filter}}
                    ]},
                    "format": "list",
                    "limit": coalesce(settings.batch_size, 100)
                })

                if not todo.data:
                    break

                queue = Queue("pending source files to review")
                queue.extend(todo.data[0:coalesce(settings.batch_size, 100):])
                threads = [
                    Thread.run(
                        "processor" + unicode(i),
                        process_batch,
                        queue,
                        coverage_index,
                        coverage_summary_index,
                        settings,
                        please_stop=please_stop
                    )
                    for i in range(NUM_THREAD)
                ]

                # ADD STOP MESSAGE
                queue.add(Thread.STOP)

                # WAIT FOR THEM TO COMPLETE
                for t in threads:
                    t.join()

        please_stop.go()
        return
    except Exception, e:
        Log.warning("Problem processing", cause=e)
class FromESMetadata(object): """ QUERY THE METADATA """ def __new__(cls, *args, **kwargs): global singlton if singlton: return singlton else: singlton = object.__new__(cls) return singlton @use_settings def __init__(self, host, index, alias=None, name=None, port=9200, settings=None): global _elasticsearch if hasattr(self, "settings"): return from pyLibrary.queries.containers.lists import ListContainer from pyLibrary.env import elasticsearch as _elasticsearch self.settings = settings self.default_name = coalesce(name, alias, index) self.default_es = _elasticsearch.Cluster(settings=settings) self.todo = Queue("refresh metadata", max=100000, unique=True) table_columns = metadata_tables() column_columns = metadata_columns() self.tables = ListContainer("meta.tables", [], wrap({c.name: c for c in table_columns})) self.columns = ListContainer("meta.columns", [], wrap({c.name: c for c in column_columns})) self.columns.insert(column_columns) self.columns.insert(table_columns) # TODO: fix monitor so it does not bring down ES if ENABLE_META_SCAN: self.worker = Thread.run("refresh metadata", self.monitor) else: self.worker = Thread.run("refresh metadata", self.not_monitor) return @property def query_path(self): return None @property def url(self): return self.default_es.path + "/" + self.default_name.replace(".", "/") def get_table(self, table_name): with self.tables.locker: return self.tables.query({"where": {"eq": {"name": table_name}}}) def upsert_column(self, c): existing_columns = filter(lambda r: r.table == c.table and r.abs_name == c.abs_name, self.columns.data) if not existing_columns: self.columns.add(c) cols = filter(lambda r: r.table == "meta.columns", self.columns.data) for cc in cols: cc.partitions = cc.cardinality = cc.last_updated = None self.todo.add(c) self.todo.extend(cols) else: set_default(existing_columns[0], c) self.todo.add(existing_columns[0]) # TEST CONSISTENCY for c, d in product(list(self.todo.queue), list(self.todo.queue)): if c.abs_name==d.abs_name and c.table==d.table and c!=d: Log.error("") def _get_columns(self, table=None): # TODO: HANDLE MORE THEN ONE ES, MAP TABLE SHORT_NAME TO ES INSTANCE alias_done = set() index = split_field(table)[0] query_path = split_field(table)[1:] metadata = self.default_es.get_metadata(index=index) for index, meta in qb.sort(metadata.indices.items(), {"value": 0, "sort": -1}): for _, properties in meta.mappings.items(): columns = _elasticsearch.parse_properties(index, None, properties.properties) columns = columns.filter(lambda r: not r.abs_name.startswith("other.") and not r.abs_name.startswith("previous_values.cf_")) # TODO: REMOVE WHEN jobs PROPERTY EXPLOSION IS CONTAINED with Timer("upserting {{num}} columns", {"num": len(columns)}, debug=DEBUG): with self.columns.locker: for c in columns: # ABSOLUTE c.table = join_field([index]+query_path) self.upsert_column(c) for alias in meta.aliases: # ONLY THE LATEST ALIAS IS CHOSEN TO GET COLUMNS if alias in alias_done: continue alias_done.add(alias) c = copy(c) c.table = join_field([alias]+query_path) self.upsert_column(c) def query(self, _query): return self.columns.query(Query(set_default( { "from": self.columns, "sort": ["table", "name"] }, _query.as_dict() ))) def get_columns(self, table): """ RETURN METADATA COLUMNS """ with self.columns.locker: columns = qb.sort(filter(lambda r: r.table == table, self.columns.data), "name") if columns: return columns self._get_columns(table=table) with self.columns.locker: columns = qb.sort(filter(lambda r: r.table == table, self.columns.data), "name") if 
columns: return columns # self._get_columns(table=table) Log.error("no columns for {{table}}", table=table) def _update_cardinality(self, c): """ QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN """ if c.type in ["object", "nested"]: Log.error("not supported") try: if c.table == "meta.columns": with self.columns.locker: partitions = qb.sort([g[c.abs_name] for g, _ in qb.groupby(self.columns, c.abs_name) if g[c.abs_name] != None]) self.columns.update({ "set": { "partitions": partitions, "count": len(self.columns), "cardinality": len(partitions), "last_updated": Date.now() }, "where": {"eq": {"table": c.table, "abs_name": c.abs_name}} }) return if c.table == "meta.tables": with self.columns.locker: partitions = qb.sort([g[c.abs_name] for g, _ in qb.groupby(self.tables, c.abs_name) if g[c.abs_name] != None]) self.columns.update({ "set": { "partitions": partitions, "count": len(self.tables), "cardinality": len(partitions), "last_updated": Date.now() }, "where": {"eq": {"table": c.table, "name": c.name}} }) return es_index = c.table.split(".")[0] result = self.default_es.post("/"+es_index+"/_search", data={ "aggs": {c.name: _counting_query(c)}, "size": 0 }) r = result.aggregations.values()[0] count = result.hits.total cardinality = coalesce(r.value, r._nested.value) if cardinality == None: Log.error("logic error") query = Dict(size=0) if c.type in ["object", "nested"]: Log.note("{{field}} has {{num}} parts", field=c.name, num=cardinality) with self.columns.locker: self.columns.update({ "set": { "count": count, "cardinality": cardinality, "last_updated": Date.now() }, "clear": ["partitions"], "where": {"eq": {"table": c.table, "name": c.name}} }) return elif cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99): Log.note("{{field}} has {{num}} parts", field=c.name, num=cardinality) with self.columns.locker: self.columns.update({ "set": { "count": count, "cardinality": cardinality, "last_updated": Date.now() }, "clear": ["partitions"], "where": {"eq": {"table": c.table, "name": c.name}} }) return elif c.type in _elasticsearch.ES_NUMERIC_TYPES and cardinality > 30: Log.note("{{field}} has {{num}} parts", field=c.name, num=cardinality) with self.columns.locker: self.columns.update({ "set": { "count": count, "cardinality": cardinality, "last_updated": Date.now() }, "clear": ["partitions"], "where": {"eq": {"table": c.table, "name": c.name}} }) return elif c.nested_path: query.aggs[literal_field(c.name)] = { "nested": {"path": listwrap(c.nested_path)[0]}, "aggs": {"_nested": {"terms": {"field": c.abs_name, "size": 0}}} } else: query.aggs[literal_field(c.name)] = {"terms": {"field": c.abs_name, "size": 0}} result = self.default_es.post("/"+es_index+"/_search", data=query) aggs = result.aggregations.values()[0] if aggs._nested: parts = qb.sort(aggs._nested.buckets.key) else: parts = qb.sort(aggs.buckets.key) Log.note("{{field}} has {{parts}}", field=c.name, parts=parts) with self.columns.locker: self.columns.update({ "set": { "count": count, "cardinality": cardinality, "partitions": parts, "last_updated": Date.now() }, "where": {"eq": {"table": c.table, "abs_name": c.abs_name}} }) except Exception, e: if "IndexMissingException" in e and c.table.startswith("testing"): Log.alert("{{col.table}} does not exist", col=c) else: self.columns.update({ "set": { "last_updated": Date.now() }, "clear":[ "count", "cardinality", "partitions", ], "where": {"eq": {"table": c.table, "abs_name": c.abs_name}} }) Log.warning("Could not get 
{{col.table}}.{{col.abs_name}} info", col=c, cause=e)