Example No. 1
    def __init__(self, rate=None, amortization_period=None, source=None, database=None, kwargs=None):
        self.amortization_period = coalesce(amortization_period, AMORTIZATION_PERIOD)
        self.rate = coalesce(rate, HG_REQUEST_PER_SECOND)
        self.cache_locker = Lock()
        self.cache = {}  # MAP FROM url TO (ready, headers, response, timestamp) PAIR
        self.no_cache = {}  # VERY SHORT TERM CACHE
        self.workers = []
        self.todo = Queue(APP_NAME+" todo")
        self.requests = Queue(APP_NAME + " requests", max=int(self.rate * self.amortization_period.seconds))
        self.url = URL(source.url)
        self.db = Sqlite(database)
        self.inbound_rate = RateLogger("Inbound")
        self.outbound_rate = RateLogger("hg.mo")

        if not self.db.query("SELECT name FROM sqlite_master WHERE type='table'").data:
            with self.db.transaction() as t:
                t.execute(
                    "CREATE TABLE cache ("
                    "   path TEXT PRIMARY KEY, "
                    "   headers TEXT, "
                    "   response TEXT, "
                    "   timestamp REAL "
                    ")"
                )

        self.threads = [
            Thread.run(APP_NAME+" worker" + text_type(i), self._worker)
            for i in range(CONCURRENCY)
        ]
        self.limiter = Thread.run(APP_NAME+" limiter", self._rate_limiter)
        self.cleaner = Thread.run(APP_NAME+" cleaner", self._cache_cleaner)
Example No. 2
    def _find_revision(self, revision):
        please_stop = False
        locker = Lock()
        output = []
        queue = Queue("branches", max=2000)
        queue.extend(b for b in self.branches if b.locale == DEFAULT_LOCALE and b.name in ["try", "mozilla-inbound", "autoland"])
        queue.add(THREAD_STOP)

        problems = []
        def _find(please_stop):
            for b in queue:
                if please_stop:
                    return
                try:
                    url = b.url + "json-info?node=" + revision
                    rev = self.get_revision(Revision(branch=b, changeset={"id": revision}))
                    with locker:
                        output.append(rev)
                    Log.note("Revision found at {{url}}", url=url)
                except Exception as f:
                    problems.append(f)

        threads = []
        for i in range(3):
            threads.append(Thread.run("find changeset " + text_type(i), _find, please_stop=please_stop))

        for t in threads:
            with assert_no_exception:
                t.join()

        return output
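
The pattern above (load a bounded Queue, append a THREAD_STOP sentinel, and let a few worker threads drain it while a Lock guards the shared output list) recurs throughout these examples. Below is a minimal sketch of that fan-out, assuming the mo_threads package exports Lock, Thread, and THREAD_STOP alongside the Queue import shown elsewhere, that iterating a Queue ends once the sentinel closes it, and that Thread.run supplies a please_stop signal to its target; check_one is a hypothetical per-item probe.

from mo_threads import Lock, Queue, Thread, THREAD_STOP

def search_all(items, check_one, concurrency=3):
    locker = Lock()
    found = []
    problems = []
    todo = Queue("todo", max=2000)
    todo.extend(items)
    todo.add(THREAD_STOP)  # SENTINEL: WORKERS DRAIN WHAT REMAINS, THEN STOP

    def _worker(please_stop):
        for item in todo:  # ITERATION ENDS WHEN THE QUEUE IS CLOSED
            if please_stop:
                return
            try:
                result = check_one(item)  # HYPOTHETICAL PER-ITEM PROBE
                with locker:
                    found.append(result)
            except Exception as cause:
                problems.append(cause)

    threads = [Thread.run("find " + str(i), _worker) for i in range(concurrency)]
    for t in threads:
        t.join()
    return found, problems
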
Example No. 3
    def __init__(self, rate=None, amortization_period=None, source=None, database=None, kwargs=None):
        self.amortization_period = coalesce(amortization_period, AMORTIZATION_PERIOD)
        self.rate = coalesce(rate, HG_REQUEST_PER_SECOND)
        self.cache_locker = Lock()
        self.cache = {}  # MAP FROM url TO (ready, headers, response, timestamp) PAIR
        self.no_cache = {}  # VERY SHORT TERM CACHE
        self.workers = []
        self.todo = Queue(APP_NAME+" todo")
        self.requests = Queue(APP_NAME + " requests", max=int(self.rate * self.amortization_period.seconds))
        self.url = URL(source.url)
        self.db = Sqlite(database)
        self.inbound_rate = RateLogger("Inbound")
        self.outbound_rate = RateLogger("hg.mo")

        if not self.db.query("SELECT name FROM sqlite_master WHERE type='table'").data:
            with self.db.transaction() as t:
                t.execute(
                    "CREATE TABLE cache ("
                    "   path TEXT PRIMARY KEY, "
                    "   headers TEXT, "
                    "   response TEXT, "
                    "   timestamp REAL "
                    ")"
                )

        self.threads = [
            Thread.run(APP_NAME+" worker" + text_type(i), self._worker)
            for i in range(CONCURRENCY)
        ]
        self.limiter = Thread.run(APP_NAME+" limiter", self._rate_limiter)
        self.cleaner = Thread.run(APP_NAME+" cleaner", self._cache_cleaner)
Example No. 4
    def __init__(self, stream):
        assert stream

        if is_text(stream):
            name = stream
            stream = self.stream = eval(stream)
            if name.startswith("sys.") and PY3:
                self.stream = Data(write=lambda d: stream.write(d.decode('utf8')))
        else:
            name = "stream"
            self.stream = stream

        # WRITE TO STREAMS CAN BE *REALLY* SLOW, WE WILL USE A THREAD
        from mo_threads import Queue

        def utf8_appender(value):
            if is_text(value):
                value = value.encode('utf8')
            self.stream.write(value)

        appender = utf8_appender

        self.queue = Queue("queue for " + self.__class__.__name__ + "(" + name + ")", max=10000, silent=True)
        self.thread = Thread("log to " + self.__class__.__name__ + "(" + name + ")", time_delta_pusher, appender=appender, queue=self.queue, interval=0.3)
        self.thread.parent.remove_child(self.thread)  # LOGGING WILL BE RESPONSIBLE FOR THREAD stop()
        self.thread.start()
    def __init__(
        self,
        host,
        index,
        port=9200,
        type="log",
        queue_size=1000,
        batch_size=100,
        kwargs=None,
    ):
        """
        settings ARE FOR THE ELASTICSEARCH INDEX
        """
        kwargs.timeout = Duration(coalesce(kwargs.timeout, "30second")).seconds
        kwargs.retry.times = coalesce(kwargs.retry.times, 3)
        kwargs.retry.sleep = Duration(coalesce(kwargs.retry.sleep, MINUTE)).seconds

        self.es = Cluster(kwargs).get_or_create_index(
            schema=json2value(value2json(SCHEMA), leaves=True),
            limit_replicas=True,
            typed=True,
            kwargs=kwargs,
        )
        self.batch_size = batch_size
        self.es.add_alias(coalesce(kwargs.alias, kwargs.index))
        self.queue = Queue("debug logs to es", max=queue_size, silent=True)

        self.worker = Thread.run("add debug logs to es", self._insert_loop)
    def __init__(self, logger):
        if not isinstance(logger, StructuredLogger):
            Log.error("Expecting a StructuredLogger")

        self.queue = Queue("Queue for " + self.__class__.__name__,
                           max=10000,
                           silent=True,
                           allow_add_after_close=True)
        self.logger = logger

        def worker(logger, please_stop):
            try:
                while not please_stop:
                    Till(seconds=1).wait()
                    logs = self.queue.pop_all()
                    for log in logs:
                        if log is THREAD_STOP:
                            please_stop.go()
                        else:
                            logger.write(**log)
            finally:
                logger.stop()

        self.thread = Thread("Thread for " + self.__class__.__name__, worker,
                             logger)
        self.thread.parent.remove_child(
            self.thread)  # LOGGING WILL BE RESPONSIBLE FOR THREAD stop()
        self.thread.start()
Example No. 7
class StructuredLogger_usingQueue(StructuredLogger):
    def __init__(self, name=None):
        queue_name = "log messages to queue"
        if name:
            queue_name += " " + name
        self.queue = Queue(queue_name)

    def write(self, template, params):
        self.queue.add(expand_template(template, params))

    def stop(self):
        self.queue.close()

    def pop(self):
        lines = self.queue.pop()
        output = []
        for l in lines.split(CR):
            # REMOVE FIRST PART, THE TIMESTAMP
            # 0123456789012345678901234567890
            # 2019-01-06 19:13:49.937542 -
            prefix = re.match(DATE_PATTERN, l)
            if prefix:
                l = l[len(prefix.group(0)):]
            if not l.strip():
                continue
            if l.strip().startswith("File"):
                continue
            output.append(l)
        return CR.join(output).strip()
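
For context, a hypothetical round trip through the queue-backed logger above: write() expands the template onto the internal Queue, and pop() strips any timestamp prefix and "File" traceback lines before returning the text. The message is illustrative only.

log = StructuredLogger_usingQueue("test")
log.write("hello {{name}}", {"name": "world"})
assert log.pop() == "hello world"  # NO TIMESTAMP WAS WRITTEN, SO THE LINE COMES BACK UNCHANGED
log.stop()
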
Example No. 8
    def __init__(self, stream):
        assert stream

        use_UTF8 = False

        if isinstance(stream, basestring):
            if stream.startswith("sys."):
                use_UTF8 = True  # sys.* ARE OLD AND CAN NOT HANDLE unicode
            self.stream = eval(stream)
            name = stream
        else:
            self.stream = stream
            name = "stream"

        # WRITE TO STREAMS CAN BE *REALLY* SLOW, WE WILL USE A THREAD
        from mo_threads import Queue

        if use_UTF8:
            def utf8_appender(value):
                if isinstance(value, unicode):
                    value = value.encode('utf8')
                self.stream.write(value)

            appender = utf8_appender
        else:
            appender = self.stream.write

        self.queue = Queue("queue for " + self.__class__.__name__ + "(" + name + ")", max=10000, silent=True)
        self.thread = Thread("log to " + self.__class__.__name__ + "(" + name + ")", time_delta_pusher, appender=appender, queue=self.queue, interval=0.3)
        self.thread.parent.remove_child(self.thread)  # LOGGING WILL BE RESPONSIBLE FOR THREAD stop()
        self.thread.start()
class StructuredLogger_usingThread(StructuredLogger):
    def __init__(self, logger, period=PERIOD):
        if not isinstance(logger, StructuredLogger):
            Log.error("Expecting a StructuredLogger")

        self.logger = logger
        self.queue = Queue(
            "Queue for " + self.__class__.__name__,
            max=10000,
            silent=True,
            allow_add_after_close=True,
        )
        self.thread = Thread("Thread for " + self.__class__.__name__, worker,
                             logger, self.queue, period)
        # worker WILL BE RESPONSIBLE FOR THREAD stop()
        self.thread.parent.remove_child(self.thread)
        self.thread.start()

    def write(self, template, params):
        try:
            self.queue.add({"template": template, "params": params})
            return self
        except Exception as e:
            e = Except.wrap(e)
            raise e  # OH NO!

    def stop(self):
        try:
            self.queue.add(
                THREAD_STOP)  # BE PATIENT, LET REST OF MESSAGE BE SENT
            self.thread.join()
        except Exception as e:
            Log.note("problem in threaded logger" + str(e))
Example No. 10
    def __init__(self, host, index, alias=None, name=None, port=9200, kwargs=None):
        global _elasticsearch
        if hasattr(self, "settings"):
            return

        from pyLibrary.queries.containers.list_usingPythonList import ListContainer
        from pyLibrary.env import elasticsearch as _elasticsearch

        self.settings = kwargs
        self.default_name = coalesce(name, alias, index)
        self.default_es = _elasticsearch.Cluster(kwargs=kwargs)
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        self.es_metadata = Null
        self.last_es_metadata = Date.now()-OLD_METADATA

        self.meta=Data()
        table_columns = metadata_tables()
        column_columns = metadata_columns()
        self.meta.tables = ListContainer("meta.tables", [], wrap({c.names["."]: c for c in table_columns}))
        self.meta.columns = ColumnList()
        self.meta.columns.insert(column_columns)
        self.meta.columns.insert(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("refresh metadata", self.not_monitor)
        return
Example No. 11
    def __init__(self, conn=None, tuid_service=None, start_workers=True, new_table=False, kwargs=None):
        try:
            self.config = kwargs
            self.conn = conn if conn else sql.Sql(self.config.database.name)
            self.hg_cache = HgMozillaOrg(kwargs=self.config.hg_cache, use_cache=True) if self.config.hg_cache else Null

            self.tuid_service = tuid_service if tuid_service else tuid.service.TUIDService(
                kwargs=self.config.tuid, conn=self.conn, clogger=self
            )
            self.rev_locker = Lock()
            self.working_locker = Lock()

            if new_table:
                with self.conn.transaction() as t:
                    t.execute("DROP TABLE IF EXISTS csetLog")

            self.init_db()
            self.next_revnum = coalesce(self.conn.get_one("SELECT max(revnum)+1 FROM csetLog")[0], 1)
            self.csets_todo_backwards = Queue(name="Clogger.csets_todo_backwards")
            self.deletions_todo = Queue(name="Clogger.deletions_todo")
            self.maintenance_signal = Signal(name="Clogger.maintenance_signal")

            if 'tuid' in self.config:
                self.config = self.config.tuid

            self.disable_backfilling = False
            self.disable_tipfilling = False
            self.disable_deletion = False
            self.disable_maintenance = False

            self.backfill_thread = None
            self.tipfill_thread = None
            self.deletion_thread = None
            self.maintenance_thread = None

            # Make sure we are filled before allowing queries
            numrevs = self.conn.get_one("SELECT count(revnum) FROM csetLog")[0]
            if numrevs < MINIMUM_PERMANENT_CSETS:
                Log.note("Filling in csets to hold {{minim}} csets.", minim=MINIMUM_PERMANENT_CSETS)
                oldest_rev = 'tip'
                with self.conn.transaction() as t:
                    tmp = t.query("SELECT min(revnum), revision FROM csetLog").data[0][1]
                    if tmp:
                        oldest_rev = tmp
                self._fill_in_range(
                    MINIMUM_PERMANENT_CSETS - numrevs,
                    oldest_rev,
                    timestamp=False
                )

            Log.note(
                "Table is filled with atleast {{minim}} entries.",
                minim=MINIMUM_PERMANENT_CSETS
            )

            if start_workers:
                self.start_workers()
        except Exception as e:
            Log.warning("Cannot setup clogger: {{cause}}", cause=str(e))
Example No. 12
def update_local_database(config, deviant_summary, candidates, since):
    if isinstance(deviant_summary, bigquery.Table):
        Log.note("Only the ETL process should fill the bigquery table")
        return

    # GET EVERYTHING WE HAVE SO FAR
    exists = deviant_summary.query({
        "select": ["signature_hash", "last_updated"],
        "where": {
            "and": [
                {
                    "in": {
                        "signature_hash": candidates.signature_hash
                    }
                },
                {
                    "exists": "num_pushes"
                },
            ]
        },
        "sort": "last_updated",
        "limit": 100000,
        "format": "list",
    }).data
    # CHOOSE MISSING, THEN OLDEST, UP TO "RECENT"
    missing = list(set(candidates.signature_hash) - set(exists.signature_hash))

    too_old = Date.today() - parse(LOCAL_RETENTION)
    needs_update = missing + [
        e.signature_hash for e in exists if e.last_updated < too_old.unix
    ]
    Log.alert("{{num}} series are candidates for local update",
              num=len(needs_update))

    limited_update = Queue("sigs")
    limited_update.extend(
        left(needs_update, coalesce(config.display.download_limit, 100)))
    Log.alert("Updating local database with {{num}} series",
              num=len(limited_update))

    with Timer("Updating local database"):

        def loop(please_stop):
            while not please_stop:
                signature_hash = limited_update.pop_one()
                if not signature_hash:
                    return
                process(
                    signature_hash,
                    since,
                    source=config.database,
                    deviant_summary=deviant_summary,
                )

        threads = [Thread.run(text(i), loop) for i in range(3)]
        for t in threads:
            t.join()

    Log.note("Local database is up to date")
Example No. 13
    def __init__(self, conn=None, tuid_service=None, kwargs=None):
        try:
            self.config = kwargs

            self.conn = conn if conn else sql.Sql(self.config.database.name)
            self.hg_cache = HgMozillaOrg(
                kwargs=self.config.hg_cache,
                use_cache=True) if self.config.hg_cache else Null

            self.tuid_service = tuid_service if tuid_service else tuid.service.TUIDService(
                database=None,
                hg=None,
                kwargs=self.config,
                conn=self.conn,
                clogger=self)
            self.rev_locker = Lock()
            self.working_locker = Lock()

            self.init_db()
            self.next_revnum = coalesce(
                self.conn.get_one("SELECT max(revnum)+1 FROM csetLog")[0], 1)
            self.csets_todo_backwards = Queue(
                name="Clogger.csets_todo_backwards")
            self.deletions_todo = Queue(name="Clogger.deletions_todo")
            self.maintenance_signal = Signal(name="Clogger.maintenance_signal")
            self.config = self.config.tuid

            self.disable_backfilling = False
            self.disable_tipfilling = False
            self.disable_deletion = False
            self.disable_maintenance = False

            # Make sure we are filled before allowing queries
            numrevs = self.conn.get_one("SELECT count(revnum) FROM csetLog")[0]
            if numrevs < MINIMUM_PERMANENT_CSETS:
                Log.note("Filling in csets to hold {{minim}} csets.",
                         minim=MINIMUM_PERMANENT_CSETS)
                oldest_rev = 'tip'
                with self.conn.transaction() as t:
                    tmp = t.query(
                        "SELECT min(revnum), revision FROM csetLog").data[0][1]
                    if tmp:
                        oldest_rev = tmp
                self._fill_in_range(MINIMUM_PERMANENT_CSETS - numrevs,
                                    oldest_rev,
                                    timestamp=False)

            Log.note(
                "Table is filled with atleast {{minim}} entries. Starting workers...",
                minim=MINIMUM_PERMANENT_CSETS)

            Thread.run('clogger-tip', self.fill_forward_continuous)
            Thread.run('clogger-backfill', self.fill_backward_with_list)
            Thread.run('clogger-maintenance', self.csetLog_maintenance)
            Thread.run('clogger-deleter', self.csetLog_deleter)

            Log.note("Started clogger workers.")
        except Exception as e:
            Log.warning("Cannot setup clogger: {{cause}}", cause=str(e))
Example No. 14
    def __init__(
        self,
        host,
        index,
        port=9200,
        type="log",
        queue_size=1000,
        batch_size=100,
        refresh_interval="1second",
        kwargs=None,
    ):
        """
        settings ARE FOR THE ELASTICSEARCH INDEX
        """
        kwargs.timeout = Duration(coalesce(kwargs.timeout, "30second")).seconds
        kwargs.retry.times = coalesce(kwargs.retry.times, 3)
        kwargs.retry.sleep = Duration(coalesce(kwargs.retry.sleep,
                                               MINUTE)).seconds
        kwargs.host = randoms.sample(listwrap(host), 1)[0]

        rollover_interval = coalesce(kwargs.rollover.interval,
                                     kwargs.rollover.max, "year")
        rollover_max = coalesce(kwargs.rollover.max, kwargs.rollover.interval,
                                "year")

        schema = set_default(
            kwargs.schema,
            {
                "mappings": {
                    kwargs.type: {
                        "properties": {
                            "~N~": {
                                "type": "nested"
                            }
                        }
                    }
                }
            },
            json2value(value2json(SCHEMA), leaves=True),
        )

        self.es = RolloverIndex(
            rollover_field={"get": [{
                "first": "."
            }, {
                "literal": "timestamp"
            }]},
            rollover_interval=rollover_interval,
            rollover_max=rollover_max,
            schema=schema,
            limit_replicas=True,
            typed=True,
            read_only=False,
            kwargs=kwargs,
        )
        self.batch_size = batch_size
        self.queue = Queue("debug logs to es", max=queue_size, silent=True)

        self.worker = Thread.run("add debug logs to es", self._insert_loop)
Example No. 15
    def __init__(self, filename=None, db=None):
        """
        :param db:  Optional, wrap a sqlite db in a thread
        :return: Multithread-safe database
        """
        if not _upgraded:
            _upgrade()

        self.filename = filename
        self.db = db
        self.queue = Queue("sql commands")   # HOLD (command, result, signal) PAIRS
        self.worker = Thread.run("sqlite db thread", self._worker)
        self.get_trace = DEBUG
Example No. 16
 def __init__(self, db):
     Table.__init__(self, META_COLUMNS_NAME)
     self.data = {}  # MAP FROM fact_name TO (abs_column_name to COLUMNS)
     self.locker = Lock()
     self._schema = None
     self.dirty = False
     self.db = db
     self.es_index = None
     self.last_load = Null
     self.todo = Queue(
         "update columns to es"
     )  # HOLD (action, column) PAIR, WHERE action in ['insert', 'update']
     self._snowflakes = Data()
     self._load_from_database()
Example No. 17
    def __init__(self,
                 host,
                 index,
                 sql_file='metadata.sqlite',
                 alias=None,
                 name=None,
                 port=9200,
                 kwargs=None):
        if hasattr(self, "settings"):
            return

        self.too_old = TOO_OLD
        self.settings = kwargs
        self.default_name = coalesce(name, alias, index)
        self.es_cluster = elasticsearch.Cluster(kwargs=kwargs)

        self.index_does_not_exist = set()
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        self.index_to_alias = Relation_usingList()

        self.es_metadata = Null
        self.metadata_last_updated = Date.now() - OLD_METADATA

        self.meta = Data()
        self.meta.columns = ColumnList()

        self.alias_to_query_paths = {
            "meta.columns": [['.']],
            "meta.tables": [['.']]
        }
        self.alias_last_updated = {
            "meta.columns": Date.now(),
            "meta.tables": Date.now()
        }
        table_columns = metadata_tables()
        self.meta.tables = ListContainer(
            "meta.tables",
            [
                # TableDesc("meta.columns", None, ".", Date.now()),
                # TableDesc("meta.tables", None, ".", Date.now())
            ],
            jx_base.Schema(".", table_columns))
        self.meta.columns.extend(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("refresh metadata", self.not_monitor)
        return
Example No. 18
    def __init__(self, filename=None, db=None, get_trace=None, upgrade=True, load_functions=False, kwargs=None):
        """
        :param filename:  FILE TO USE FOR DATABASE
        :param db: AN EXISTING sqlite3 DB YOU WOULD LIKE TO USE (INSTEAD OF USING filename)
        :param get_trace: GET THE STACK TRACE AND THREAD FOR EVERY DB COMMAND (GOOD FOR DEBUGGING)
        :param upgrade: REPLACE PYTHON sqlite3 DLL WITH MORE RECENT ONE, WITH MORE FUNCTIONS (NOT WORKING)
        :param load_functions: LOAD EXTENDED MATH FUNCTIONS (MAY REQUIRE upgrade)
        :param kwargs:
        """
        if upgrade and not _upgraded:
            _upgrade()

        self.settings = kwargs
        self.filename = File(filename).abspath
        if known_databases.get(self.filename):
            Log.error("Not allowed to create more than one Sqlite instance for {{file}}", file=self.filename)

        # SETUP DATABASE
        DEBUG and Log.note("Sqlite version {{version}}", version=sqlite3.sqlite_version)
        try:
            if db == None:
                self.db = sqlite3.connect(
                    database=coalesce(self.filename, ":memory:"),
                    check_same_thread=False,
                    isolation_level=None
                )
            else:
                self.db = db
        except Exception as e:
            Log.error("could not open file {{filename}}", filename=self.filename, cause=e)
        load_functions and self._load_functions()

        self.locker = Lock()
        self.available_transactions = []  # LIST OF ALL THE TRANSACTIONS BEING MANAGED
        self.queue = Queue("sql commands")   # HOLD (command, result, signal, stacktrace) TUPLES

        self.get_trace = coalesce(get_trace, TRACE)
        self.upgrade = upgrade
        self.closed = False

        # WORKER VARIABLES
        self.transaction_stack = []  # THE TRANSACTION OBJECT WE HAVE PARTIALLY RUN
        self.last_command_item = None  # USE THIS TO HELP BLAME current_transaction FOR HANGING ON TOO LONG
        self.too_long = None
        self.delayed_queries = []
        self.delayed_transactions = []
        self.worker = Thread.run("sqlite db thread", self._worker)

        DEBUG and Log.note("Sqlite version {{version}}", version=self.query("select sqlite_version()").data[0][0])
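
The comment on the command queue ("HOLD (command, result, signal, stacktrace) TUPLES") hints at the pattern: callers enqueue a command together with a place for the result and a Signal to wait on, while a single worker thread owns the non-threadsafe connection. A generic sketch under those assumptions follows (shutdown and error handling omitted); SingleOwner and its callable resource are hypothetical, not the Sqlite class's actual worker.

from mo_threads import Queue, Signal, Thread

class SingleOwner(object):
    # GENERIC SKETCH: ONE WORKER THREAD OWNS resource, OTHER THREADS QUEUE COMMANDS
    def __init__(self, resource):
        self.resource = resource        # E.G. A CONNECTION THAT IS NOT THREADSAFE
        self.queue = Queue("commands")  # HOLD (command, result, signal) TUPLES
        self.worker = Thread.run("single owner worker", self._worker)

    def execute(self, command):
        result = {}
        done = Signal()
        self.queue.add((command, result, done))
        done.wait()                     # BLOCK THE CALLER UNTIL THE WORKER IS DONE
        return result.get("value")

    def _worker(self, please_stop):
        for command, result, done in self.queue:
            if please_stop:
                break
            result["value"] = self.resource(command)  # HYPOTHETICAL: resource IS CALLABLE
            done.go()
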
Example No. 19
 def __init__(self, name):
     Table.__init__(self, "meta.columns")
     self.db_file = File("metadata." + name + ".sqlite")
     self.data = {}  # MAP FROM ES_INDEX TO (abs_column_name to COLUMNS)
     self.locker = Lock()
     self._schema = None
     self.db = sqlite3.connect(
         database=self.db_file.abspath, check_same_thread=False, isolation_level=None
     )
     self.last_load = Null
     self.todo = Queue(
         "update columns to db"
     )  # HOLD (action, column) PAIR, WHERE action in ['insert', 'update']
     self._db_load()
     Thread.run("update " + name, self._db_worker)
Example No. 20
    def __init__(self, logger):
        if not isinstance(logger, StructuredLogger):
            Log.error("Expecting a StructuredLogger")

        self.queue = Queue("Queue for " + self.__class__.__name__, max=10000, silent=True, allow_add_after_close=True)
        self.logger = logger

        def worker(logger, please_stop):
            try:
                while not please_stop:
                    logs = self.queue.pop_all()
                    if not logs:
                        (Till(seconds=1) | please_stop).wait()
                        continue
                    for log in logs:
                        if log is THREAD_STOP:
                            please_stop.go()
                        else:
                            logger.write(**log)
            except Exception as e:
                print("problem in " + StructuredLogger_usingThread.__name__ + ": " + str(e))
            finally:
                Log.note("stop the child")
                logger.stop()

        self.thread = Thread("Thread for " + self.__class__.__name__, worker, logger)
        self.thread.parent.remove_child(self.thread)  # LOGGING WILL BE RESPONSIBLE FOR THREAD stop()
        self.thread.start()
Example No. 21
    def __init__(
        self,
        host,
        index,
        port=9200,
        type="log",
        queue_size=1000,
        batch_size=100,
        kwargs=None,
    ):
        """
        settings ARE FOR THE ELASTICSEARCH INDEX
        """
        kwargs.timeout = Duration(coalesce(kwargs.timeout, "30second")).seconds
        kwargs.retry.times = coalesce(kwargs.retry.times, 3)
        kwargs.retry.sleep = Duration(coalesce(kwargs.retry.sleep, MINUTE)).seconds
        kwargs.host = Random.sample(listwrap(host), 1)[0]

        schema = json2value(value2json(SCHEMA), leaves=True)
        schema.mappings[type].properties["~N~"].type = "nested"
        self.es = Cluster(kwargs).get_or_create_index(
            schema=schema,
            limit_replicas=True,
            typed=True,
            kwargs=kwargs,
        )
        self.batch_size = batch_size
        self.es.add_alias(coalesce(kwargs.alias, kwargs.index))
        self.queue = Queue("debug logs to es", max=queue_size, silent=True)

        self.worker = Thread.run("add debug logs to es", self._insert_loop)
Example No. 22
    def __init__(self, host, index, alias=None, name=None, port=9200, kwargs=None):
        global _elasticsearch
        if hasattr(self, "settings"):
            return

        from pyLibrary.queries.containers.list_usingPythonList import ListContainer
        from pyLibrary.env import elasticsearch as _elasticsearch

        self.settings = kwargs
        self.default_name = coalesce(name, alias, index)
        self.default_es = _elasticsearch.Cluster(kwargs=kwargs)
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        self.es_metadata = Null
        self.last_es_metadata = Date.now()-OLD_METADATA

        self.meta=Data()
        table_columns = metadata_tables()
        column_columns = metadata_columns()
        self.meta.tables = ListContainer("meta.tables", [], wrap({c.names["."]: c for c in table_columns}))
        self.meta.columns = ColumnList()
        self.meta.columns.insert(column_columns)
        self.meta.columns.insert(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("refresh metadata", self.not_monitor)
        return
Example No. 23
    def stop(cls):
        """
        DECONSTRUCTS ANY LOGGING, AND RETURNS TO DIRECT-TO-stdout LOGGING
        EXECUTING MULTIPLE TIMES IN A ROW IS SAFE, IT HAS NO NET EFFECT, IT STILL LOGS TO stdout
        :return: NOTHING
        """

        from mo_threads import profiles

        if cls.cprofiler and hasattr(cls, "settings"):
            if cls.cprofiler == None:
                from mo_threads import Queue

                cls.cprofiler_stats = Queue(
                    "cprofiler stats"
                )  # ACCUMULATION OF STATS FROM ALL THREADS

            import pstats
            cls.cprofiler_stats.add(pstats.Stats(cls.cprofiler))
            write_profile(cls.settings.cprofile, cls.cprofiler_stats.pop_all())

        if profiles.ON and hasattr(cls, "settings"):
            profiles.write(cls.settings.profile)
        cls.main_log.stop()
        cls.main_log = StructuredLogger_usingStream(sys.stdout)
    def __init__(self, logger, period=PERIOD):
        if not isinstance(logger, StructuredLogger):
            Log.error("Expecting a StructuredLogger")

        self.logger = logger
        self.queue = Queue(
            "Queue for " + self.__class__.__name__,
            max=10000,
            silent=True,
            allow_add_after_close=True,
        )
        self.thread = Thread("Thread for " + self.__class__.__name__, worker,
                             logger, self.queue, period)
        # worker WILL BE RESPONSIBLE FOR THREAD stop()
        self.thread.parent.remove_child(self.thread)
        self.thread.start()
Example No. 25
 def __init__(self, es_cluster):
     Table.__init__(self, META_COLUMNS_NAME)
     self.data = {}  # MAP FROM ES_INDEX TO (abs_column_name to COLUMNS)
     self.locker = Lock()
     self._schema = None
     self.dirty = False
     self.es_cluster = es_cluster
     self.es_index = None
     self.last_load = Null
     self.todo = Queue(
         "update columns to es"
     )  # HOLD (action, column) PAIR, WHERE action in ['insert', 'update']
     self._db_load()
     Thread.run("update " + META_COLUMNS_NAME,
                self._update_from_es,
                parent_thread=MAIN_THREAD)
Example No. 26
    def __init__(self, stream):
        assert stream

        if isinstance(stream, text_type):
            name = stream
            stream = self.stream = eval(stream)
            if name.startswith("sys.") and PY3:
                self.stream = Data(write=lambda d: stream.write(d.decode('utf8')))
        else:
            name = "stream"
            self.stream = stream

        # WRITE TO STREAMS CAN BE *REALLY* SLOW, WE WILL USE A THREAD
        from mo_threads import Queue

        def utf8_appender(value):
            if isinstance(value, text_type):
                value = value.encode('utf8')
            self.stream.write(value)

        appender = utf8_appender

        self.queue = Queue("queue for " + self.__class__.__name__ + "(" + name + ")", max=10000, silent=True)
        self.thread = Thread("log to " + self.__class__.__name__ + "(" + name + ")", time_delta_pusher, appender=appender, queue=self.queue, interval=0.3)
        self.thread.parent.remove_child(self.thread)  # LOGGING WILL BE RESPONSIBLE FOR THREAD stop()
        self.thread.start()
Example No. 27
def update_local_database():
    # GET EVERYTHING WE HAVE SO FAR
    exists = summary_table.query({
        "select": ["id", "last_updated"],
        "where": {
            "and": [{
                "in": {
                    "id": candidates.id
                }
            }, {
                "exists": "num_pushes"
            }]
        },
        "sort": "last_updated",
        "limit": 100000,
        "format": "list",
    }).data
    # CHOOSE MISSING, THEN OLDEST, UP TO "RECENT"
    missing = list(set(candidates.id) - set(exists.id))

    too_old = Date.today() - parse(LOCAL_RETENTION)
    needs_update = missing + [
        e for e in exists if e.last_updated < too_old.unix
    ]
    Log.alert("{{num}} series are candidates for local update",
              num=len(needs_update))

    limited_update = Queue("sigs")
    limited_update.extend(
        left(needs_update, coalesce(config.analysis.download_limit, 100)))
    Log.alert("Updating local database with {{num}} series",
              num=len(limited_update))

    with Timer("Updating local database"):

        def loop(please_stop):
            while not please_stop:
                sig_id = limited_update.pop_one()
                if not sig_id:
                    return
                process(sig_id)

        threads = [Thread.run(text(i), loop) for i in range(3)]
        for t in threads:
            t.join()

    Log.note("Local database is up to date")
Example No. 28
    def __init__(self, filename=None, db=None, upgrade=True, load_functions=False, kwargs=None):
        """
        :param db:  Optional, wrap a sqlite db in a thread
        :return: Multithread-safe database
        """
        if upgrade and not _upgraded:
            _upgrade()

        self.settings = kwargs
        self.filename = File(filename).abspath
        self.db = db
        self.queue = Queue("sql commands")   # HOLD (command, result, signal) PAIRS
        self.worker = Thread.run("sqlite db thread", self._worker)
        self.get_trace = TRACE
        self.upgrade = upgrade
        self.closed = False
        if DEBUG:
            Log.note("Sqlite version {{version}}", version=self.query("select sqlite_version()").data[0][0])
Example No. 29
def _late_import():
    global _Log

    from mo_logs import Log as _Log
    from mo_threads import Queue

    if _Log.cprofiler_stats == None:
        _Log.cprofiler_stats = Queue(
            "cprofiler stats")  # ACCUMULATION OF STATS FROM ALL THREADS
Example No. 30
    def __init__(self, host, index, alias=None, name=None, port=9200, kwargs=None):
        if hasattr(self, "settings"):
            return

        self.settings = kwargs
        self.too_old = TOO_OLD
        self.es_cluster = elasticsearch.Cluster(kwargs=kwargs)
        self.index_does_not_exist = set()
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        self.meta = Data()
        self.meta.columns = ColumnList(self.es_cluster)
        self.meta.columns.extend(META_TABLES_DESC.columns)
        self.meta.tables = ListContainer(META_TABLES_NAME, [], jx_base.Schema(".", META_TABLES_DESC.columns))
        self.meta.tables.extend([META_COLUMNS_DESC, META_TABLES_DESC])
        self.alias_to_query_paths = {}
        for i, settings in self.es_cluster.get_metadata().indices.items():
            if len(settings.aliases) == 0:
                alias = i
            elif len(settings.aliases) == 1:
                alias = first(settings.aliases)
            else:
                Log.error("expecting only one alias per index")

            desc = TableDesc(
                name=alias,
                url=None,
                query_path=ROOT_PATH,
                last_updated=Date.MIN,
                columns=[]
            )
            self.meta.tables.add(desc)
            self.alias_to_query_paths[alias] = [desc.query_path]
            self.alias_to_query_paths[self._find_alias(alias)] = [desc.query_path]

        # WE MUST PAUSE?

        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("not refresh metadata", self.not_monitor)
        return
Example No. 31
class StructuredLogger_usingThread(StructuredLogger):
    def __init__(self, logger):
        if not isinstance(logger, StructuredLogger):
            Log.error("Expecting a StructuredLogger")

        self.queue = Queue("Queue for " + self.__class__.__name__,
                           max=10000,
                           silent=True,
                           allow_add_after_close=True)
        self.logger = logger

        def worker(logger, please_stop):
            try:
                while not please_stop:
                    logs = self.queue.pop_all()
                    if not logs:
                        (Till(seconds=1) | please_stop).wait()
                        continue
                    for log in logs:
                        if log is THREAD_STOP:
                            please_stop.go()
                        else:
                            logger.write(**log)
            except Exception as e:
                print("problem in " + StructuredLogger_usingThread.__name__ +
                      ": " + str(e))
            finally:
                Log.note("stop the child")
                logger.stop()

        self.thread = Thread("Thread for " + self.__class__.__name__, worker,
                             logger)
        self.thread.parent.remove_child(
            self.thread)  # LOGGING WILL BE RESPONSIBLE FOR THREAD stop()
        self.thread.start()

    def write(self, template, params):
        try:
            self.queue.add({"template": template, "params": params})
            return self
        except Exception as e:
            e = Except.wrap(e)
            raise e  # OH NO!

    def stop(self):
        try:
            self.queue.add(
                THREAD_STOP)  # BE PATIENT, LET REST OF MESSAGE BE SENT
            self.thread.join()
            Log.note("joined on thread")
        except Exception as e:
            Log.note("problem in threaded logger" + str(e))

        with suppress_exception:
            self.queue.close()
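
To see what the wrapper buys: any StructuredLogger whose write() is slow can be wrapped so callers return immediately while the background worker flushes batches about once per second. A hypothetical usage, reusing the StructuredLogger, Till, and expand_template names that appear throughout these examples; SlowLogger stands in for a slow sink.

class SlowLogger(StructuredLogger):
    def write(self, template, params):
        Till(seconds=2).wait()  # SIMULATE A SLOW SINK
        print(expand_template(template, params))

    def stop(self):
        pass

log = StructuredLogger_usingThread(SlowLogger())
log.write("event {{i}}", {"i": 1})  # RETURNS IMMEDIATELY; THE worker THREAD FLUSHES LATER
log.stop()  # SENDS THREAD_STOP, JOINS THE WORKER, AND CLOSES THE QUEUE
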
Example No. 32
    def _find_revision(self, revision):
        please_stop = False
        locker = Lock()
        output = []
        queue = Queue("branches", max=2000)
        queue.extend(b for b in self.branches if b.locale == DEFAULT_LOCALE and b.name in ["try", "mozilla-inbound", "autoland"])
        queue.add(THREAD_STOP)

        problems = []
        def _find(please_stop):
            for b in queue:
                if please_stop:
                    return
                try:
                    url = b.url + "json-info?node=" + revision
                    rev = self.get_revision(Revision(branch=b, changeset={"id": revision}))
                    with locker:
                        output.append(rev)
                    Log.note("Revision found at {{url}}", url=url)
                except Exception as f:
                    problems.append(f)

        threads = []
        for i in range(3):
            threads.append(Thread.run("find changeset " + text_type(i), _find, please_stop=please_stop))

        for t in threads:
            with assert_no_exception:
                t.join()

        return output
Example No. 33
    def __init__(self, filename=None, db=None, upgrade=True):
        """
        :param db:  Optional, wrap a sqlite db in a thread
        :return: Multithread-safe database
        """
        if upgrade and not _upgraded:
            _upgrade()

        self.filename = filename
        self.db = db
        self.queue = Queue("sql commands")   # HOLD (command, result, signal) PAIRS
        self.worker = Thread.run("sqlite db thread", self._worker)
        self.get_trace = DEBUG
        self.upgrade = upgrade
Example No. 34
class StructuredLogger_usingQueue(StructuredLogger):
    def __init__(self, name=None):
        queue_name = "log messages to queue"
        if name:
            queue_name += " " + name
        self.queue = Queue(queue_name)

    def write(self, template, params):
        self.queue.add(expand_template(template, params))

    def stop(self):
        self.queue.close()

    def pop(self):
        lines = self.queue.pop()
        output = []
        for l in lines.split("\n"):
            if l[19:22] == " - ":
                l = l[22:]
            if l.strip().startswith("File"):
                continue
            output.append(l)
        return "\n".join(output).strip()
Example No. 35
    def __init__(self,
                 host,
                 index,
                 sql_file='metadata.sqlite',
                 alias=None,
                 name=None,
                 port=9200,
                 kwargs=None):
        if hasattr(self, "settings"):
            return

        self.too_old = TOO_OLD
        self.settings = kwargs
        self.default_name = coalesce(name, alias, index)
        self.default_es = elasticsearch.Cluster(kwargs=kwargs)
        self.index_does_not_exist = set()
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        self.es_metadata = Null
        self.abs_columns = set()
        self.last_es_metadata = Date.now() - OLD_METADATA

        self.meta = Data()
        table_columns = metadata_tables()
        column_columns = metadata_columns()
        self.meta.tables = ListContainer(
            "meta.tables", [], wrap({c.names["."]: c
                                     for c in table_columns}))
        self.meta.columns = ColumnList()
        self.meta.columns.insert(column_columns)
        self.meta.columns.insert(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("refresh metadata", self.not_monitor)
        return
Example No. 36
    def __init__(
        self,
        name,
        broker=None,
        include=None,
        **kwargs
    ):
        self.Task = MethodCaller

        self.name = name
        self.request_queue = Queue(name=name+" requests")
        self.response_queue = Queue(name=name+" responses")
        self.kwargs = kwargs
        self.include = include
        self.broker = broker
        self._config = {}
        self._tasks = {}
        self.on_init()
        self.response_worker = Thread.run("response worker", self._response_worker)
        self.responses = {}
        self.responses_lock = Lock()
        self.id_lock = Lock()
        self.next_id = 1
        self.worker = Worker(self.request_queue, self.response_queue, celery=self)
Example No. 37
class StructuredLogger_usingQueue(StructuredLogger):

    def __init__(self, name=None):
        queue_name = "log messages to queue"
        if name:
            queue_name += " "+name
        self.queue = Queue(queue_name)

    def write(self, template, params):
        self.queue.add(expand_template(template, params))

    def stop(self):
        self.queue.close()

    def pop(self):
        lines = self.queue.pop()
        output = []
        for l in lines.split("\n"):
            if l[19:22] == " - ":
                l = l[22:]
            if l.strip().startswith("File"):
                continue
            output.append(l)
        return "\n".join(output).strip()
 def __init__(self, host, index, port=9200, type="log", max_size=1000, batch_size=100, kwargs=None):
     """
     settings ARE FOR THE ELASTICSEARCH INDEX
     """
     self.es = Cluster(kwargs).get_or_create_index(
         schema=mo_json.json2value(value2json(SCHEMA), leaves=True),
         limit_replicas=True,
         tjson=True,
         kwargs=kwargs
     )
     self.batch_size = batch_size
     self.es.add_alias(coalesce(kwargs.alias, kwargs.index))
     self.queue = Queue("debug logs to es", max=max_size, silent=True)
     self.es.settings.retry.times = coalesce(self.es.settings.retry.times, 3)
     self.es.settings.retry.sleep = Duration(coalesce(self.es.settings.retry.sleep, MINUTE))
     Thread.run("add debug logs to es", self._insert_loop)
Example No. 39
class StructuredLogger_usingThreadedStream(StructuredLogger):
    # stream CAN BE AN OBJECT WITH write() METHOD, OR A STRING
    # WHICH WILL eval() TO ONE
    def __init__(self, stream):
        assert stream

        use_UTF8 = False

        if isinstance(stream, basestring):
            if stream.startswith("sys."):
                use_UTF8 = True  # sys.* ARE OLD AND CAN NOT HANDLE unicode
            self.stream = eval(stream)
            name = stream
        else:
            self.stream = stream
            name = "stream"

        # WRITE TO STREAMS CAN BE *REALLY* SLOW, WE WILL USE A THREAD
        from mo_threads import Queue

        if use_UTF8:
            def utf8_appender(value):
                if isinstance(value, unicode):
                    value = value.encode('utf8')
                self.stream.write(value)

            appender = utf8_appender
        else:
            appender = self.stream.write

        self.queue = Queue("queue for " + self.__class__.__name__ + "(" + name + ")", max=10000, silent=True)
        self.thread = Thread("log to " + self.__class__.__name__ + "(" + name + ")", time_delta_pusher, appender=appender, queue=self.queue, interval=0.3)
        self.thread.parent.remove_child(self.thread)  # LOGGING WILL BE RESPONSIBLE FOR THREAD stop()
        self.thread.start()

    def write(self, template, params):
        try:
            self.queue.add({"template": template, "params": params})
            return self
        except Exception as e:
            raise e  # OH NO!

    def stop(self):
        try:
            self.queue.add(THREAD_STOP)  # BE PATIENT, LET REST OF MESSAGE BE SENT
            self.thread.join()
        except Exception as e:
            if DEBUG_LOGGING:
                raise e

        try:
            self.queue.close()
        except Exception as f:
            if DEBUG_LOGGING:
                raise f
Example No. 40
class StructuredLogger_usingThread(StructuredLogger):

    def __init__(self, logger):
        if not isinstance(logger, StructuredLogger):
            Log.error("Expecting a StructuredLogger")

        self.queue = Queue("Queue for " + self.__class__.__name__, max=10000, silent=True, allow_add_after_close=True)
        self.logger = logger

        def worker(logger, please_stop):
            try:
                while not please_stop:
                    logs = self.queue.pop_all()
                    if not logs:
                        (Till(seconds=1) | please_stop).wait()
                        continue
                    for log in logs:
                        if log is THREAD_STOP:
                            please_stop.go()
                        else:
                            logger.write(**log)
            except Exception as e:
                print("problem in " + StructuredLogger_usingThread.__name__ + ": " + str(e))
            finally:
                Log.note("stop the child")
                logger.stop()

        self.thread = Thread("Thread for " + self.__class__.__name__, worker, logger)
        self.thread.parent.remove_child(self.thread)  # LOGGING WILL BE RESPONSIBLE FOR THREAD stop()
        self.thread.start()

    def write(self, template, params):
        try:
            self.queue.add({"template": template, "params": params})
            return self
        except Exception as e:
            e = Except.wrap(e)
            raise e  # OH NO!

    def stop(self):
        try:
            self.queue.add(THREAD_STOP)  # BE PATIENT, LET REST OF MESSAGE BE SENT
            self.thread.join()
            Log.note("joined on thread")
        except Exception as e:
            Log.note("problem in threaded logger" + str(e))

        with suppress_exception:
            self.queue.close()
Example No. 41
    def __init__(self, host, index, sql_file='metadata.sqlite', alias=None, name=None, port=9200, kwargs=None):
        if hasattr(self, "settings"):
            return

        self.too_old = TOO_OLD
        self.settings = kwargs
        self.default_name = coalesce(name, alias, index)
        self.es_cluster = elasticsearch.Cluster(kwargs=kwargs)

        self.index_does_not_exist = set()
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        self.index_to_alias = Relation_usingList()

        self.es_metadata = Null
        self.metadata_last_updated = Date.now() - OLD_METADATA

        self.meta = Data()
        self.meta.columns = ColumnList()

        self.alias_to_query_paths = {
            "meta.columns": [['.']],
            "meta.tables": [['.']]
        }
        self.alias_last_updated = {
            "meta.columns": Date.now(),
            "meta.tables": Date.now()
        }
        table_columns = metadata_tables()
        self.meta.tables = ListContainer(
            "meta.tables",
            [
                # TableDesc("meta.columns", None, ".", Date.now()),
                # TableDesc("meta.tables", None, ".", Date.now())
            ],
            jx_base.Schema(".", table_columns)
        )
        self.meta.columns.extend(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("refresh metadata", self.not_monitor)
        return
Example No. 42
    def stop(cls):
        from mo_logs import profiles

        if cls.cprofiler and hasattr(cls, "settings"):
            if cls.cprofiler == None:
                from mo_threads import Queue

                cls.cprofiler_stats = Queue(
                    "cprofiler stats"
                )  # ACCUMULATION OF STATS FROM ALL THREADS

            import pstats
            cls.cprofiler_stats.add(pstats.Stats(cls.cprofiler))
            write_profile(cls.settings.cprofile, cls.cprofiler_stats.pop_all())

        if profiles.ON and hasattr(cls, "settings"):
            profiles.write(cls.settings.profile)
        cls.main_log.stop()
        cls.main_log = StructuredLogger_usingStream(sys.stdout)
Example No. 43
class StructuredLogger_usingThreadedStream(StructuredLogger):
    # stream CAN BE AN OBJECT WITH write() METHOD, OR A STRING
    # WHICH WILL eval() TO ONE
    def __init__(self, stream):
        assert stream

        if isinstance(stream, text_type):
            name = stream
            stream = self.stream = eval(stream)
            if name.startswith("sys.") and PY3:
                self.stream = Data(write=lambda d: stream.write(d.decode('utf8')))
        else:
            name = "stream"
            self.stream = stream

        # WRITE TO STREAMS CAN BE *REALLY* SLOW, WE WILL USE A THREAD
        from mo_threads import Queue

        def utf8_appender(value):
            if isinstance(value, text_type):
                value = value.encode('utf8')
            self.stream.write(value)

        appender = utf8_appender

        self.queue = Queue("queue for " + self.__class__.__name__ + "(" + name + ")", max=10000, silent=True)
        self.thread = Thread("log to " + self.__class__.__name__ + "(" + name + ")", time_delta_pusher, appender=appender, queue=self.queue, interval=0.3)
        self.thread.parent.remove_child(self.thread)  # LOGGING WILL BE RESPONSIBLE FOR THREAD stop()
        self.thread.start()

    def write(self, template, params):
        try:
            self.queue.add({"template": template, "params": params})
            return self
        except Exception as e:
            raise e  # OH NO!

    def stop(self):
        try:
            self.queue.add(THREAD_STOP)  # BE PATIENT, LET REST OF MESSAGE BE SENT
            self.thread.join()
        except Exception as e:
            if DEBUG_LOGGING:
                raise e

        try:
            self.queue.close()
        except Exception as f:
            if DEBUG_LOGGING:
                raise f
Example No. 44
 def __init__(self, name=None):
     queue_name = "log messages to queue"
     if name:
         queue_name += " "+name
     self.queue = Queue(queue_name)
Example No. 45
class ElasticsearchMetadata(Namespace):
    """
    MANAGE SNOWFLAKE SCHEMAS FOR EACH OF THE ALIASES FOUND IN THE CLUSTER
    """

    @override
    def __new__(cls, kwargs, *args, **_kwargs):
        es_cluster = elasticsearch.Cluster(kwargs)
        output = known_clusters.get(id(es_cluster))
        if output is None:
            output = object.__new__(cls)
            known_clusters[id(es_cluster)] = output
        return output

    @override
    def __init__(self, host, index, sql_file='metadata.sqlite', alias=None, name=None, port=9200, kwargs=None):
        if hasattr(self, "settings"):
            return

        self.too_old = TOO_OLD
        self.settings = kwargs
        self.default_name = coalesce(name, alias, index)
        self.es_cluster = elasticsearch.Cluster(kwargs=kwargs)

        self.index_does_not_exist = set()
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        self.index_to_alias = {}

        self.es_metadata = Null
        self.metadata_last_updated = Date.now() - OLD_METADATA

        self.meta = Data()
        self.meta.columns = ColumnList(URL(self.es_cluster.settings.host).host)

        self.alias_to_query_paths = {
            "meta.columns": [ROOT_PATH],
            "meta.tables": [ROOT_PATH]
        }
        self.alias_last_updated = {
            "meta.columns": Date.now(),
            "meta.tables": Date.now()
        }
        table_columns = metadata_tables()
        self.meta.tables = ListContainer(
            "meta.tables",
            [],
            jx_base.Schema(".", table_columns)
        )
        self.meta.columns.extend(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("not refresh metadata", self.not_monitor)
        return




    @property
    def namespace(self):
        return self.meta.columns.namespace

    @property
    def url(self):
        return self.es_cluster.url / self.default_name.replace(".", "/")

    def _reload_columns(self, table_desc):
        """
        :param alias: A REAL ALIAS (OR NAME OF INDEX THAT HAS NO ALIAS)
        :return:
        """
        # FIND ALL INDEXES OF ALIAS
        es_last_updated = self.es_cluster.metatdata_last_updated

        alias = table_desc.name
        canonical_index = self.es_cluster.get_best_matching_index(alias).index
        es_metadata_update_required = not (table_desc.timestamp < es_last_updated)
        metadata = self.es_cluster.get_metadata(force=es_metadata_update_required)

        props = [
            (self.es_cluster.get_index(index=i, type=t, debug=DEBUG), t, m.properties)
            for i, d in metadata.indices.items()
            if alias in d.aliases
            for t, m in [_get_best_type_from_mapping(d.mappings)]
        ]

        # CONFIRM ALL COLUMNS ARE SAME, FIX IF NOT
        dirty = False
        all_comparisons = list(jx.pairwise(props)) + list(jx.pairwise(jx.reverse(props)))
        # NOTICE THE SAME (index, type, properties) TRIPLE FROM ABOVE
        for (i1, t1, p1), (i2, t2, p2) in all_comparisons:
            diff = elasticsearch.diff_schema(p2, p1)
            if not self.settings.read_only:
                for d in diff:
                    dirty = True
                    i1.add_property(*d)
        meta = self.es_cluster.get_metadata(force=dirty).indices[canonical_index]

        data_type, mapping = _get_best_type_from_mapping(meta.mappings)
        mapping.properties["_id"] = {"type": "string", "index": "not_analyzed"}
        columns = self._parse_properties(alias, mapping)
        table_desc.timestamp = es_last_updated
        return columns

    def _parse_properties(self, alias, mapping):
        abs_columns = elasticsearch.parse_properties(alias, ".", ROOT_PATH, mapping.properties)
        if DEBUG and any(c.cardinality == 0 and c.name != '_id' for c in abs_columns):
            Log.warning(
                "Some columns are not stored in {{url}} {{index|quote}} table:\n{{names}}",
                url=self.es_cluster.url,
                index=alias,
                names=[
                    ".".join((c.es_index, c.name))
                    for c in abs_columns
                    if c.cardinality == 0
                ]
            )

        with Timer("upserting {{num}} columns", {"num": len(abs_columns)}, silent=not DEBUG):
            # LIST OF EVERY NESTED PATH
            query_paths = [[c.es_column] for c in abs_columns if c.es_type == "nested"]
            for a, b in itertools.product(query_paths, query_paths):
                aa = a[0]
                bb = b[0]
                if aa and bb.startswith(aa):
                    for i, b_prefix in enumerate(b):
                        if len(b_prefix) > len(aa):
                            continue
                        if aa == b_prefix:
                            break  # SPLIT ALREADY FOUND
                        b.insert(i, aa)
                        break
            for q in query_paths:
                q.append(".")
            query_paths.append(ROOT_PATH)

            # ENSURE ALL TABLES HAVE THE QUERY PATHS SET
            self.alias_to_query_paths[alias] = query_paths
            for i, a in self.index_to_alias.items():
                if a == alias:
                    self.alias_to_query_paths[i] = query_paths

            # ENSURE COLUMN HAS CORRECT jx_type
            # PICK DEEPEST NESTED PROPERTY AS REPRESENTATIVE
            output = []
            best = {}
            for abs_column in abs_columns:
                abs_column.jx_type = jx_type(abs_column)
                if abs_column.jx_type not in STRUCT:
                    clean_name = unnest_path(abs_column.name)
                    other = best.get(clean_name)
                    if other:
                        if len(other.nested_path) < len(abs_column.nested_path):
                            output.remove(other)
                            self.meta.columns.update({"clear": ".", "where": {"eq": {"es_column": other.es_column, "es_index": other.es_index}}})
                        else:
                            continue
                    best[clean_name] = abs_column
                output.append(abs_column)

            # REGISTER ALL COLUMNS
            canonicals = []
            for abs_column in output:
                canonical = self.meta.columns.add(abs_column)
                canonicals.append(canonical)

            self.todo.extend(canonicals)
            return canonicals

    def query(self, _query):
        return self.meta.columns.query(QueryOp(set_default(
            {
                "from": self.meta.columns,
                "sort": ["table", "name"]
            },
            _query.__data__()
        )))

    def _find_alias(self, name):
        if self.metadata_last_updated < self.es_cluster.metatdata_last_updated:
            for a in self.es_cluster.get_aliases():
                self.index_to_alias[a.index] = coalesce(a.alias, a.index)
                self.alias_last_updated.setdefault(a.alias, Date.MIN)
        if name in self.alias_last_updated:
            return name
        else:
            return self.index_to_alias.get(name)

    def get_columns(self, table_name, column_name=None, after=None, timeout=None):
        """
        RETURN METADATA COLUMNS

        :param table_name: TABLE WE WANT COLUMNS FOR
        :param column_name:  OPTIONAL NAME, IF INTERESTED IN ONLY ONE COLUMN
        :param after: FORCE LOAD, WAITING FOR last_updated TO BE AFTER THIS TIME
        :param timeout: Signal; True when we should give up
        :return:
        """
        DEBUG and after and Log.note("getting columns for after {{time}}", time=after)
        table_path = split_field(table_name)
        root_table_name = table_path[0]

        alias = self._find_alias(root_table_name)
        if not alias:
            self.es_cluster.get_metadata(force=True)
            alias = self._find_alias(root_table_name)
            if not alias:
                Log.error("{{table|quote}} does not exist", table=table_name)

        try:
            table = self.get_table(alias)[0]
            # LAST TIME WE GOT INFO FOR THIS TABLE
            if not table:
                table = TableDesc(
                    name=alias,
                    url=None,
                    query_path=["."],
                    timestamp=Date.MIN
                )
                with self.meta.tables.locker:
                    self.meta.tables.add(table)
                columns = self._reload_columns(table)
                DEBUG and Log.note("columns from reload")
            elif after or table.timestamp < self.es_cluster.metatdata_last_updated:
                columns = self._reload_columns(table)
                DEBUG and Log.note("columns from reload")
            else:
                columns = self.meta.columns.find(alias, column_name)
                DEBUG and Log.note("columns from find()")

            DEBUG and Log.note("columns are {{ids}}", ids=[id(c) for c in columns])

            columns = jx.sort(columns, "name")

            if after is None:
                return columns  # DO NOT WAIT FOR COMPLETE COLUMNS

            # WAIT FOR THE COLUMNS TO UPDATE
            while True:
                pending = [c for c in columns if after >= c.last_updated or (c.cardinality == None and c.jx_type not in STRUCT)]
                if not pending:
                    break
                if timeout:
                    Log.error("trying to gets columns timed out")
                if DEBUG:
                    if len(pending) > 10:
                        Log.note("waiting for {{num}} columns to update by {{timestamp}}", num=len(pending), timestamp=after)
                    else:
                        Log.note("waiting for columns to update by {{timestamp}}; {{columns|json}}", timestamp=after, columns=[c.es_index + "." + c.es_column + " id="+text_type(id(c)) for c in pending])
                Till(seconds=1).wait()
            return columns
        except Exception as e:
            Log.error("Failure to get columns for {{table}}", table=table_name, cause=e)

        return []

    def _update_cardinality(self, column):
        """
        QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN
        """
        now = Date.now()
        if column.es_index in self.index_does_not_exist:
            return

        if column.jx_type in STRUCT:
            Log.error("not supported")
        try:
            if column.es_index == "meta.columns":
                partitions = jx.sort([g[column.es_column] for g, _ in jx.groupby(self.meta.columns, column.es_column) if g[column.es_column] != None])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.columns),
                        "cardinality": len(partitions),
                        "multi": 1,
                        "last_updated": now
                    },
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            if column.es_index == "meta.tables":
                partitions = jx.sort([g[column.es_column] for g, _ in jx.groupby(self.meta.tables, column.es_column) if g[column.es_column] != None])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.tables),
                        "cardinality": len(partitions),
                        "multi": 1,
                        "last_updated": now
                    },
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return

            es_index = column.es_index.split(".")[0]

            is_text = [cc for cc in self.meta.columns if cc.es_column == column.es_column and cc.es_type == "text"]
            if is_text:
                # text IS A MULTIVALUE STRING THAT CAN ONLY BE FILTERED
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "aggs": {
                        "count": {"filter": {"match_all": {}}}
                    },
                    "size": 0
                })
                count = result.hits.total
                cardinality = max(1001, count)
                multi = 1001
            elif column.es_column == "_id":
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "query": {"match_all": {}},
                    "size": 0
                })
                count = cardinality = result.hits.total
                multi = 1
            elif column.es_type == BOOLEAN:
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "aggs": {
                        "count": _counting_query(column)
                    },
                    "size": 0
                })
                count = result.hits.total
                cardinality = 2

                DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "partitions": [False, True],
                        "multi": 1,
                        "last_updated": now
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            else:
                es_query = {
                    "aggs": {
                        "count": _counting_query(column),
                        "_filter": {
                            "aggs": {"multi": {"max": {"script": "doc[" + quote(column.es_column) + "].values.size()"}}},
                            "filter": {"bool": {"should": [
                                {"range": {"etl.timestamp.~n~": {"gte": (Date.today() - WEEK)}}},
                                {"bool": {"must_not": {"exists": {"field": "etl.timestamp.~n~"}}}}
                            ]}}
                        }
                    },
                    "size": 0
                }

                result = self.es_cluster.post("/" + es_index + "/_search", data=es_query)
                agg_results = result.aggregations
                count = result.hits.total
                cardinality = coalesce(agg_results.count.value, agg_results.count._nested.value, agg_results.count.doc_count)
                multi = int(coalesce(agg_results._filter.multi.value, 1))
                if cardinality == None:
                    Log.error("logic error")

            query = Data(size=0)

            if column.es_column == "_id":
                self.meta.columns.update({
                    "set": {
                        "count": cardinality,
                        "cardinality": cardinality,
                        "multi": 1,
                        "last_updated": now
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            elif cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99):
                DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "last_updated": now
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            elif column.es_type in elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
                DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "last_updated": now
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            elif len(column.nested_path) != 1:
                query.aggs["_"] = {
                    "nested": {"path": column.nested_path[0]},
                    "aggs": {"_nested": {"terms": {"field": column.es_column}}}
                }
            elif cardinality == 0:  # WHEN DOES THIS HAPPEN?
                query.aggs["_"] = {"terms": {"field": column.es_column}}
            else:
                query.aggs["_"] = {"terms": {"field": column.es_column, "size": cardinality}}

            result = self.es_cluster.post("/" + es_index + "/_search", data=query)

            aggs = result.aggregations._
            if aggs._nested:
                parts = jx.sort(aggs._nested.buckets.key)
            else:
                parts = jx.sort(aggs.buckets.key)

            DEBUG and Log.note("update metadata for {{column.es_index}}.{{column.es_column}} (id={{id}}) at {{time}}", id=id(column), column=column, time=now)
            self.meta.columns.update({
                "set": {
                    "count": count,
                    "cardinality": cardinality,
                    "multi": multi,
                    "partitions": parts,
                    "last_updated": now
                },
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
        except Exception as e:
            # CANNOT IMPORT: THE TEST MODULE SETS UP LOGGING
            # from tests.test_jx import TEST_TABLE
            e = Except.wrap(e)
            TEST_TABLE = "testdata"
            is_missing_index = any(w in e for w in ["IndexMissingException", "index_not_found_exception"])
            is_test_table = column.es_index.startswith((TEST_TABLE_PREFIX, TEST_TABLE))
            if is_missing_index:
                # WE EXPECT TEST TABLES TO DISAPPEAR
                Log.warning("Missing index {{col.es_index}}", col=column, cause=e)
                self.meta.columns.update({
                    "clear": ".",
                    "where": {"eq": {"es_index": column.es_index}}
                })
                self.index_does_not_exist.add(column.es_index)
            elif "No field found for" in e:
                self.meta.columns.update({
                    "clear": ".",
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                Log.warning("Could not get column {{col.es_index}}.{{col.es_column}} info", col=column, cause=e)
            else:
                self.meta.columns.update({
                    "set": {
                        "last_updated": now
                    },
                    "clear": [
                        "count",
                        "cardinality",
                        "multi",
                        "partitions",
                    ],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                Log.warning("Could not get {{col.es_index}}.{{col.es_column}} info", col=column, cause=e)

    def monitor(self, please_stop):
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            try:
                if not self.todo:
                    old_columns = [
                        c
                        for c in self.meta.columns
                        if ((c.last_updated < Date.now() - MAX_COLUMN_METADATA_AGE) or c.cardinality == None) and c.jx_type not in STRUCT
                    ]
                    if old_columns:
                        DEBUG and Log.note(
                            "Old columns {{names|json}} last updated {{dates|json}}",
                            names=wrap(old_columns).es_column,
                            dates=[Date(t).format() for t in wrap(old_columns).last_updated]
                        )
                        self.todo.extend(old_columns)
                    else:
                        DEBUG and Log.note("no more metatdata to update")

                column = self.todo.pop(Till(seconds=(10*MINUTE).seconds))
                if column:
                    if column is THREAD_STOP:
                        continue

                    with Timer("update {{table}}.{{column}}", param={"table": column.es_index, "column": column.es_column}, silent=not DEBUG):
                        if column.es_index in self.index_does_not_exist:
                            DEBUG and Log.note("{{column.es_column}} does not exist", column=column)
                            self.meta.columns.update({
                                "clear": ".",
                                "where": {"eq": {"es_index": column.es_index}}
                            })
                            continue
                        if column.jx_type in STRUCT or split_field(column.es_column)[-1] == EXISTS_TYPE:
                            DEBUG and Log.note("{{column.es_column}} is a struct", column=column)
                            column.last_updated = Date.now()
                            continue
                        elif column.last_updated > Date.now() - TOO_OLD and column.cardinality is not None:
                            # DO NOT UPDATE FRESH COLUMN METADATA
                            DEBUG and Log.note("{{column.es_column}} is still fresh ({{ago}} ago)", column=column, ago=(Date.now()-Date(column.last_updated)).seconds)
                            continue
                        try:
                            self._update_cardinality(column)
                            (DEBUG and not column.es_index.startswith(TEST_TABLE_PREFIX)) and Log.note("updated {{column.name}}", column=column)
                        except Exception as e:
                            if '"status":404' in e:
                                self.meta.columns.update({
                                    "clear": ".",
                                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                                })
                            else:
                                Log.warning("problem getting cardinality for {{column.name}}", column=column, cause=e)
            except Exception as e:
                Log.warning("problem in cardinality monitor", cause=e)

    def not_monitor(self, please_stop):
        Log.alert("metadata scan has been disabled")
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            column = self.todo.pop()
            if column == THREAD_STOP:
                break

            if column.jx_type in STRUCT or split_field(column.es_column)[-1] == EXISTS_TYPE:
                DEBUG and Log.note("{{column.es_column}} is a struct", column=column)
                column.last_updated = Date.now()
                continue
            elif column.last_updated > Date.now() - TOO_OLD and column.cardinality is not None:
                # DO NOT UPDATE FRESH COLUMN METADATA
                DEBUG and Log.note("{{column.es_column}} is still fresh ({{ago}} ago)", column=column, ago=(Date.now()-Date(column.last_updated)).seconds)
                continue

            with Timer("Update {{col.es_index}}.{{col.es_column}}", param={"col": column}, silent=not DEBUG, too_long=0.05):
                if untype_path(column.name) in ["build.type", "run.type"]:
                    try:
                        self._update_cardinality(column)
                    except Exception as e:
                        Log.warning("problem getting cardinality for {{column.name}}", column=column, cause=e)
                else:
                    column.last_updated = Date.now()


    def get_table(self, name):
        if name == "meta.columns":
            return self.meta.columns

        with self.meta.tables.locker:
            return wrap([t for t in self.meta.tables.data if t.name == name])

    def get_snowflake(self, fact_table_name):
        return Snowflake(fact_table_name, self)

    def get_schema(self, name):
        if name == "meta.columns":
            return self.meta.columns.schema
        if name == "meta.tables":
            return self.meta.tables
        root, rest = tail_field(name)
        return self.get_snowflake(root).get_schema(rest)
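
ElasticsearchMetadata.__new__ above returns one cached instance per cluster (keyed by id(es_cluster)), and __init__ bails out when it is re-run on an instance that already has its settings. A stripped-down sketch of that keyed-singleton pattern, with purely illustrative names:

_known = {}  # key -> instance


class PerKeySingleton(object):
    def __new__(cls, key, *args, **kwargs):
        instance = _known.get(key)
        if instance is None:
            instance = object.__new__(cls)
            _known[key] = instance
        return instance

    def __init__(self, key):
        if hasattr(self, "key"):
            return  # already initialized; do not rebuild existing state
        self.key = key


a = PerKeySingleton("cluster-1")
b = PerKeySingleton("cluster-1")
assert a is b  # same key -> same instance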
Example no. 46
0
class FromESMetadata(Schema):
    """
    QUERY THE METADATA
    """

    def __new__(cls, *args, **kwargs):
        global singlton
        if singlton:
            return singlton
        else:
            singlton = object.__new__(cls)
            return singlton

    @override
    def __init__(self, host, index, alias=None, name=None, port=9200, kwargs=None):
        global _elasticsearch
        if hasattr(self, "settings"):
            return

        from pyLibrary.queries.containers.list_usingPythonList import ListContainer
        from pyLibrary.env import elasticsearch as _elasticsearch

        self.settings = kwargs
        self.default_name = coalesce(name, alias, index)
        self.default_es = _elasticsearch.Cluster(kwargs=kwargs)
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        self.es_metadata = Null
        self.last_es_metadata = Date.now()-OLD_METADATA

        self.meta=Data()
        table_columns = metadata_tables()
        column_columns = metadata_columns()
        self.meta.tables = ListContainer("meta.tables", [], wrap({c.names["."]: c for c in table_columns}))
        self.meta.columns = ColumnList()
        self.meta.columns.insert(column_columns)
        self.meta.columns.insert(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("refresh metadata", self.not_monitor)
        return

    @property
    def query_path(self):
        return None

    @property
    def url(self):
        return self.default_es.path + "/" + self.default_name.replace(".", "/")

    def get_table(self, table_name):
        with self.meta.tables.locker:
            return wrap([t for t in self.meta.tables.data if t.name == table_name])

    def _upsert_column(self, c):
        # ASSUMES THE self.meta.columns.locker IS HELD
        existing_columns = self.meta.columns.find(c.es_index, c.names["."])
        if not existing_columns:
            self.meta.columns.add(c)
            self.todo.add(c)

            if ENABLE_META_SCAN:
                if DEBUG:
                    Log.note("todo: {{table}}::{{column}}", table=c.es_index, column=c.es_column)
                # MARK meta.columns AS DIRTY TOO
                cols = self.meta.columns.find("meta.columns", None)
                for cc in cols:
                    cc.partitions = cc.cardinality = None
                    cc.last_updated = Date.now()
                self.todo.extend(cols)
        else:
            canonical = existing_columns[0]
            if canonical is not c:
                set_default(c.names, canonical.names)
                for key in Column.__slots__:
                    canonical[key] = c[key]
            if DEBUG:
                Log.note("todo: {{table}}::{{column}}", table=canonical.es_index, column=canonical.es_column)
            self.todo.add(canonical)

    def _get_columns(self, table=None):
        # TODO: HANDLE MORE THAN ONE ES, MAP TABLE SHORT_NAME TO ES INSTANCE
        table_path = split_field(table)
        es_index = table_path[0]
        query_path = join_field(table_path[1:])
        meta = self.es_metadata.indices[es_index]
        if not meta or self.last_es_metadata < Date.now() - OLD_METADATA:
            self.es_metadata = self.default_es.get_metadata(force=True)
            meta = self.es_metadata.indices[es_index]

        for _, properties in meta.mappings.items():
            properties.properties["_id"] = {"type": "string", "index": "not_analyzed"}
            self._parse_properties(meta.index, properties, meta)

    def _parse_properties(self, abs_index, properties, meta):
        # IT IS IMPORTANT THAT NESTED PROPERTIES NAME ALL COLUMNS, AND
        # ALL COLUMNS ARE GIVEN NAMES FOR ALL NESTED PROPERTIES
        abs_columns = _elasticsearch.parse_properties(abs_index, None, properties.properties)
        abs_columns = abs_columns.filter(  # TODO: REMOVE WHEN jobs PROPERTY EXPLOSION IS CONTAINED
            lambda r: not r.es_column.startswith("other.") and
                      not r.es_column.startswith("previous_values.cf_") and
                      not r.es_index.startswith("debug") and
                      r.es_column.find("=") == -1 and
                      r.es_column.find(" ") == -1
        )

        def add_column(c, query_path):
            c.last_updated = Date.now()
            if query_path[0] != ".":
                c.names[query_path[0]] = relative_field(c.names["."], query_path[0])

            with self.meta.columns.locker:
                self._upsert_column(c)
                for alias in meta.aliases:
                    c = copy(c)
                    c.es_index = alias
                    self._upsert_column(c)

        with Timer("upserting {{num}} columns", {"num": len(abs_columns)}, debug=DEBUG):
            # LIST OF EVERY NESTED PATH
            query_paths = [[c.es_column] for c in abs_columns if c.type == "nested"]
            for a, b in itertools.product(query_paths, query_paths):
                aa = a[0]
                bb = b[0]
                if aa and bb.startswith(aa):
                    for i, b_prefix in enumerate(b):
                        if len(b_prefix) > len(aa):
                            continue
                        if aa == b_prefix:
                            break  # SPLIT ALREADY FOUND
                        b.insert(i, aa)
                        break
            for q in query_paths:
                q.append(".")
            query_paths.append(ROOT_PATH)

            # ADD RELATIVE COLUMNS
            for abs_column in abs_columns:
                for query_path in query_paths:
                    add_column(abs_column, query_path)

    def query(self, _query):
        return self.meta.columns.query(QueryOp(set_default(
            {
                "from": self.meta.columns,
                "sort": ["table", "name"]
            },
            _query.__data__()
        )))

    def get_columns(self, table_name, column_name=None, force=False):
        """
        RETURN METADATA COLUMNS
        """
        table_path = split_field(table_name)
        es_index_name = table_path[0]
        query_path = join_field(table_path[1:])
        table = self.get_table(es_index_name)[0]
        abs_column_name = None if column_name == None else concat_field(query_path, column_name)

        try:
            # LAST TIME WE GOT INFO FOR THIS TABLE
            if not table:
                table = Table(
                    name=es_index_name,
                    url=None,
                    query_path=None,
                    timestamp=Date.now()
                )
                with self.meta.tables.locker:
                    self.meta.tables.add(table)
                self._get_columns(table=es_index_name)
            elif force or table.timestamp == None or table.timestamp < Date.now() - MAX_COLUMN_METADATA_AGE:
                table.timestamp = Date.now()
                self._get_columns(table=es_index_name)

            with self.meta.columns.locker:
                columns = self.meta.columns.find(es_index_name, column_name)
            if columns:
                columns = jx.sort(columns, "names.\\.")
                # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
                while len(self.todo) and not all(columns.get("last_updated")):
                    if DEBUG:
                        Log.note("waiting for columns to update {{columns|json}}", columns=[c.es_index+"."+c.es_column for c in columns if not c.last_updated])
                    Till(seconds=1).wait()
                return columns
        except Exception as e:
            Log.error("Not expected", cause=e)

        if abs_column_name:
            Log.error("no columns matching {{table}}.{{column}}", table=table_name, column=abs_column_name)
        else:
            self._get_columns(table=table_name)  # TO TEST WHAT HAPPENED
            Log.error("no columns for {{table}}?!", table=table_name)

    def _update_cardinality(self, c):
        """
        QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN
        """
        if c.type in STRUCT:
            Log.error("not supported")
        try:
            if c.es_index == "meta.columns":
                with self.meta.columns.locker:
                    partitions = jx.sort([g[c.es_column] for g, _ in jx.groupby(self.meta.columns, c.es_column) if g[c.es_column] != None])
                    self.meta.columns.update({
                        "set": {
                            "partitions": partitions,
                            "count": len(self.meta.columns),
                            "cardinality": len(partitions),
                            "last_updated": Date.now()
                        },
                        "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                    })
                return
            if c.es_index == "meta.tables":
                with self.meta.columns.locker:
                    partitions = jx.sort([g[c.es_column] for g, _ in jx.groupby(self.meta.tables, c.es_column) if g[c.es_column] != None])
                    self.meta.columns.update({
                        "set": {
                            "partitions": partitions,
                            "count": len(self.meta.tables),
                            "cardinality": len(partitions),
                            "last_updated": Date.now()
                        },
                        "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                    })
                return

            es_index = c.es_index.split(".")[0]
            result = self.default_es.post("/" + es_index + "/_search", data={
                "aggs": {c.names["."]: _counting_query(c)},
                "size": 0
            })
            r = result.aggregations.values()[0]
            count = result.hits.total
            cardinality = coalesce(r.value, r._nested.value, 0 if r.doc_count==0 else None)
            if cardinality == None:
                Log.error("logic error")

            query = Data(size=0)
            if cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99):
                if DEBUG:
                    Log.note("{{table}}.{{field}} has {{num}} parts", table=c.es_index, field=c.es_column, num=cardinality)
                with self.meta.columns.locker:
                    self.meta.columns.update({
                        "set": {
                            "count": count,
                            "cardinality": cardinality,
                            "last_updated": Date.now()
                        },
                        "clear": ["partitions"],
                        "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                    })
                return
            elif c.type in _elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
                if DEBUG:
                    Log.note("{{field}} has {{num}} parts", field=c.name, num=cardinality)
                with self.meta.columns.locker:
                    self.meta.columns.update({
                        "set": {
                            "count": count,
                            "cardinality": cardinality,
                            "last_updated": Date.now()
                        },
                        "clear": ["partitions"],
                        "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                    })
                return
            elif len(c.nested_path) != 1:
                query.aggs[literal_field(c.names["."])] = {
                    "nested": {"path": c.nested_path[0]},
                    "aggs": {"_nested": {"terms": {"field": c.es_column, "size": 0}}}
                }
            else:
                query.aggs[literal_field(c.names["."])] = {"terms": {"field": c.es_column, "size": 0}}

            result = self.default_es.post("/" + es_index + "/_search", data=query)

            aggs = result.aggregations.values()[0]
            if aggs._nested:
                parts = jx.sort(aggs._nested.buckets.key)
            else:
                parts = jx.sort(aggs.buckets.key)

            if DEBUG:
                Log.note("{{field}} has {{parts}}", field=c.name, parts=parts)
            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "partitions": parts,
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                })
        except Exception as e:
            if "IndexMissingException" in e and c.es_index.startswith(TEST_TABLE_PREFIX):
                with self.meta.columns.locker:
                    self.meta.columns.update({
                        "set": {
                            "count": 0,
                            "cardinality": 0,
                            "last_updated": Date.now()
                        },
                        "clear":[
                            "partitions"
                        ],
                        "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                    })
            else:
                self.meta.columns.update({
                    "set": {
                        "last_updated": Date.now()
                    },
                    "clear": [
                        "count",
                        "cardinality",
                        "partitions",
                    ],
                    "where": {"eq": {"names.\\.": ".", "es_index": c.es_index, "es_column": c.es_column}}
                })
                Log.warning("Could not get {{col.es_index}}.{{col.es_column}} info", col=c, cause=e)

    def monitor(self, please_stop):
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            try:
                if not self.todo:
                    with self.meta.columns.locker:
                        old_columns = [
                            c
                            for c in self.meta.columns
                            if (c.last_updated == None or c.last_updated < Date.now()-TOO_OLD) and c.type not in STRUCT
                        ]
                        if old_columns:
                            if DEBUG:
                                Log.note("Old columns wth dates {{dates|json}}", dates=wrap(old_columns).last_updated)
                            self.todo.extend(old_columns)
                            # TEST CONSISTENCY
                            for c, d in product(list(self.todo.queue), list(self.todo.queue)):
                                if c.es_column == d.es_column and c.es_index == d.es_index and c != d:
                                    Log.error("")
                        else:
                            if DEBUG:
                                Log.note("no more metatdata to update")

                column = self.todo.pop(Till(seconds=(10*MINUTE).seconds))
                if column:
                    if DEBUG:
                        Log.note("update {{table}}.{{column}}", table=column.es_index, column=column.es_column)
                    if column.type in STRUCT:
                        with self.meta.columns.locker:
                            column.last_updated = Date.now()
                        continue
                    elif column.last_updated >= Date.now()-TOO_OLD:
                        continue
                    try:
                        self._update_cardinality(column)
                        if DEBUG and not column.es_index.startswith(TEST_TABLE_PREFIX):
                            Log.note("updated {{column.name}}", column=column)
                    except Exception as e:
                        Log.warning("problem getting cardinality for {{column.name}}", column=column, cause=e)
            except Exception as e:
                Log.warning("problem in cardinality monitor", cause=e)

    def not_monitor(self, please_stop):
        Log.alert("metadata scan has been disabled")
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            c = self.todo.pop()
            if c == THREAD_STOP:
                break

            if not c.last_updated or c.last_updated >= Date.now()-TOO_OLD:
                continue

            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "last_updated": Date.now()
                    },
                    "clear":[
                        "count",
                        "cardinality",
                        "partitions",
                    ],
                    "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                })
            if DEBUG:
                Log.note("Could not get {{col.es_index}}.{{col.es_column}} info", col=c)
Example no. 47
0
class Cache(object):
    """
    For Caching hg.mo requests
    """

    @override
    def __init__(self, rate=None, amortization_period=None, source=None, database=None, kwargs=None):
        self.amortization_period = coalesce(amortization_period, AMORTIZATION_PERIOD)
        self.rate = coalesce(rate, HG_REQUEST_PER_SECOND)
        self.cache_locker = Lock()
        self.cache = {}  # MAP FROM url TO (ready, headers, response, timestamp) PAIR
        self.no_cache = {}  # VERY SHORT TERM CACHE
        self.workers = []
        self.todo = Queue(APP_NAME+" todo")
        self.requests = Queue(APP_NAME + " requests", max=int(self.rate * self.amortization_period.seconds))
        self.url = URL(source.url)
        self.db = Sqlite(database)
        self.inbound_rate = RateLogger("Inbound")
        self.outbound_rate = RateLogger("hg.mo")

        if not self.db.query("SELECT name FROM sqlite_master WHERE type='table'").data:
            with self.db.transaction() as t:
                t.execute(
                    "CREATE TABLE cache ("
                    "   path TEXT PRIMARY KEY, "
                    "   headers TEXT, "
                    "   response TEXT, "
                    "   timestamp REAL "
                    ")"
                )

        self.threads = [
            Thread.run(APP_NAME+" worker" + text_type(i), self._worker)
            for i in range(CONCURRENCY)
        ]
        self.limiter = Thread.run(APP_NAME+" limiter", self._rate_limiter)
        self.cleaner = Thread.run(APP_NAME+" cleaner", self._cache_cleaner)

    def _rate_limiter(self, please_stop):
        try:
            max_requests = self.requests.max
            recent_requests = []

            while not please_stop:
                now = Date.now()
                too_old = now - self.amortization_period

                recent_requests = [t for t in recent_requests if t > too_old]

                num_recent = len(recent_requests)
                if num_recent >= max_requests:
                    space_free_at = recent_requests[0] + self.amortization_period
                    (please_stop | Till(till=space_free_at.unix)).wait()
                    continue
                for _ in xrange(num_recent, max_requests):
                    request = self.todo.pop()
                    now = Date.now()
                    recent_requests.append(now)
                    self.requests.add(request)
        except Exception as e:
            Log.warning("failure", cause=e)

    def _cache_cleaner(self, please_stop):
        while not please_stop:
            now = Date.now()
            too_old = now-CACHE_RETENTION

            remove = set()
            with self.cache_locker:
                for path, (ready, headers, response, timestamp) in self.cache:
                    if timestamp < too_old:
                        remove.add(path)
                for r in remove:
                    del self.cache[r]
            (please_stop | Till(seconds=CACHE_RETENTION.seconds / 2)).wait()

    def please_cache(self, path):
        """
        :return: False if `path` is not to be cached
        """
        if path.endswith("/tip"):
            return False
        if any(k in path for k in ["/json-annotate/", "/json-info/", "/json-log/", "/json-rev/", "/rev/", "/raw-rev/", "/raw-file/", "/json-pushes", "/pushloghtml", "/file/"]):
            return True

        return False

    def request(self, method, path, headers):
        now = Date.now()
        self.inbound_rate.add(now)
        ready = Signal(path)

        # TEST CACHE
        with self.cache_locker:
            pair = self.cache.get(path)
            if pair is None:
                self.cache[path] = (ready, None, None, now)

        if pair is not None:
            # REQUEST IS IN THE QUEUE ALREADY, WAIT
            ready, headers, response, then = pair
            if response is None:
                ready.wait()
                with self.cache_locker:
                    ready, headers, response, timestamp = self.cache.get(path)
            with self.db.transaction() as t:
                t.execute("UPDATE cache SET timestamp=" + quote_value(now) + " WHERE path=" + quote_value(path) + " AND timestamp<" + quote_value(now))
            return Response(
                response,
                status=200,
                headers=json.loads(headers)
            )

        # TEST DB
        db_response = self.db.query("SELECT headers, response FROM cache WHERE path=" + quote_value(path)).data
        if db_response:
            headers, response = db_response[0]
            with self.db.transaction() as t:
                t.execute("UPDATE cache SET timestamp=" + quote_value(now) + " WHERE path=" + quote_value(path) + " AND timestamp<" + quote_value(now))
            with self.cache_locker:
                self.cache[path] = (ready, headers, response.encode('latin1'), now)
            ready.go()

            return Response(
                response,
                status=200,
                headers=json.loads(headers)
            )

        # MAKE A NETWORK REQUEST
        self.todo.add((ready, method, path, headers, now))
        ready.wait()
        with self.cache_locker:
            ready, headers, response, timestamp = self.cache[path]
        return Response(
            response,
            status=200,
            headers=json.loads(headers)
        )

    def _worker(self, please_stop):
        while not please_stop:
            pair = self.requests.pop(till=please_stop)
            if please_stop:
                break
            ready, method, path, req_headers, timestamp = pair

            try:
                url = self.url / path
                self.outbound_rate.add(Date.now())
                response = http.request(method, url, req_headers)

                del response.headers['transfer-encoding']
                resp_headers = value2json(response.headers)
                resp_content = response.raw.read()

                please_cache = self.please_cache(path)
                if please_cache:
                    with self.db.transaction() as t:
                        t.execute("INSERT INTO cache (path, headers, response, timestamp) VALUES" + quote_list((path, resp_headers, resp_content.decode('latin1'), timestamp)))
                with self.cache_locker:
                    self.cache[path] = (ready, resp_headers, resp_content, timestamp)
            except Exception as e:
                Log.warning("problem with request to {{path}}", path=path, cause=e)
                with self.cache_locker:
                    ready, headers, response, timestamp = self.cache[path]
                    del self.cache[path]
            finally:
                ready.go()
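
Cache._rate_limiter() above throttles outbound requests with a sliding window: it remembers the timestamps of recent requests and, once the window is full, waits until the oldest timestamp ages out of the amortization period before releasing more work. A self-contained sketch of that idea using only the standard library (class and method names are illustrative):

import time
from collections import deque


class SlidingWindowLimiter(object):
    def __init__(self, max_requests, period_seconds):
        self.max_requests = max_requests
        self.period = period_seconds
        self.recent = deque()  # timestamps of requests inside the window

    def acquire(self):
        while True:
            now = time.monotonic()
            # drop timestamps that have aged out of the window
            while self.recent and self.recent[0] <= now - self.period:
                self.recent.popleft()
            if len(self.recent) < self.max_requests:
                self.recent.append(now)
                return
            # window is full: sleep until the oldest request leaves it
            time.sleep(self.recent[0] + self.period - now)


limiter = SlidingWindowLimiter(max_requests=10, period_seconds=1.0)
for _ in range(3):
    limiter.acquire()  # each call waits as needed to respect the rate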
Example no. 48
0
class Sqlite(DB):
    """
    Allows multi-threaded access
    Loads extension functions (like SQRT)
    """

    canonical = None

    def __init__(self, filename=None, db=None, upgrade=True):
        """
        :param db:  Optional, wrap a sqlite db in a thread
        :return: Multithread-safe database
        """
        if upgrade and not _upgraded:
            _upgrade()

        self.filename = filename
        self.db = db
        self.queue = Queue("sql commands")   # HOLD (command, result, signal) PAIRS
        self.worker = Thread.run("sqlite db thread", self._worker)
        self.get_trace = DEBUG
        self.upgrade = upgrade

    def _enhancements(self):
        def regex(pattern, value):
            return 1 if re.match(pattern+"$", value) else 0
        con = self.db.create_function("regex", 2, regex)

        class Percentile(object):
            def __init__(self, percentile):
                self.percentile=percentile
                self.acc=[]

            def step(self, value):
                self.acc.append(value)

            def finalize(self):
                return percentile(self.acc, self.percentile)

        con.create_aggregate("percentile", 2, Percentile)

    def execute(self, command):
        """
        COMMANDS WILL BE EXECUTED IN THE ORDER THEY ARE GIVEN
        BUT CAN INTERLEAVE WITH OTHER THREAD COMMANDS
        :param command: COMMAND FOR SQLITE
        :return: None
        """
        if DEBUG:  # EXECUTE IMMEDIATELY FOR BETTER STACK TRACE
            return self.query(command)

        if self.get_trace:
            trace = extract_stack(1)
        else:
            trace = None
        self.queue.add((command, None, None, trace))

    def query(self, command):
        """
        WILL BLOCK CALLING THREAD UNTIL THE command IS COMPLETED
        :param command: COMMAND FOR SQLITE
        :return: list OF RESULTS
        """
        if not self.worker:
            self.worker = Thread.run("sqlite db thread", self._worker)

        signal = Signal()
        result = Data()
        self.queue.add((command, result, signal, None))
        signal.wait()
        if result.exception:
            Log.error("Problem with Sqlite call", cause=result.exception)
        return result

    def _worker(self, please_stop):
        global _load_extension_warning_sent

        if DEBUG:
            Log.note("Sqlite version {{version}}", version=sqlite3.sqlite_version)
        if Sqlite.canonical:
            self.db = Sqlite.canonical
        else:
            self.db = sqlite3.connect(coalesce(self.filename, ':memory:'))

            library_loc = File.new_instance(sys.modules[__name__].__file__, "../..")
            full_path = File.new_instance(library_loc, "vendor/sqlite/libsqlitefunctions.so").abspath
            try:
                trace = extract_stack(0)[0]
                if self.upgrade:
                    if os.name == 'nt':
                        file = File.new_instance(trace["file"], "../../vendor/sqlite/libsqlitefunctions.so")
                    else:
                        file = File.new_instance(trace["file"], "../../vendor/sqlite/libsqlitefunctions")

                    full_path = file.abspath
                    self.db.enable_load_extension(True)
                    self.db.execute("SELECT load_extension(" + self.quote_value(full_path) + ")")
            except Exception as e:
                if not _load_extension_warning_sent:
                    _load_extension_warning_sent = True
                    Log.warning("Could not load {{file}}}, doing without. (no SQRT for you!)", file=full_path, cause=e)

        try:
            while not please_stop:
                command, result, signal, trace = self.queue.pop(till=please_stop)

                if DEBUG_INSERT and command.strip().lower().startswith("insert"):
                    Log.note("Running command\n{{command|indent}}", command=command)
                if DEBUG and not command.strip().lower().startswith("insert"):
                    Log.note("Running command\n{{command|indent}}", command=command)
                with Timer("Run command", debug=DEBUG):
                    if signal is not None:
                        try:
                            curr = self.db.execute(command)
                            self.db.commit()
                            result.meta.format = "table"
                            result.header = [d[0] for d in curr.description] if curr.description else None
                            result.data = curr.fetchall()
                            if DEBUG and result.data:
                                text = convert.table2csv(list(result.data))
                                Log.note("Result:\n{{data}}", data=text)
                        except Exception as e:
                            e = Except.wrap(e)
                            result.exception = Except(ERROR, "Problem with\n{{command|indent}}", command=command, cause=e)
                        finally:
                            signal.go()
                    else:
                        try:
                            self.db.execute(command)
                            self.db.commit()
                        except Exception as e:
                            e = Except.wrap(e)
                            e.cause = Except(
                                type=ERROR,
                                template="Bad call to Sqlite",
                                trace=trace
                            )
                            Log.warning("Failure to execute", cause=e)

        except Exception as e:
            if not please_stop:
                Log.error("Problem with sql thread", e)
        finally:
            if DEBUG:
                Log.note("Database is closed")
            self.db.commit()
            self.db.close()

    def quote_column(self, column_name, table=None):
        return quote_column(column_name, table)

    def quote_value(self, value):
        return quote_value(value)
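
The Sqlite wrapper above funnels every statement through a queue to a single worker thread that owns the connection; query() blocks on a signal until the worker has filled in either the result or an exception. A minimal standard-library sketch of that single-writer pattern (names are illustrative; the extension loading and stack tracing are omitted):

import queue
import sqlite3
import threading

STOP = object()


class ThreadedSqlite(object):
    def __init__(self, filename=":memory:"):
        self.queue = queue.Queue()  # holds (command, result_holder, done_event) tuples
        self.worker = threading.Thread(target=self._worker, args=(filename,), daemon=True)
        self.worker.start()

    def query(self, command):
        done = threading.Event()
        result = {}
        self.queue.put((command, result, done))
        done.wait()  # block the calling thread until the worker is finished
        if "exception" in result:
            raise result["exception"]
        return result["data"]

    def _worker(self, filename):
        db = sqlite3.connect(filename)  # the connection never leaves this thread
        while True:
            item = self.queue.get()
            if item is STOP:
                break
            command, result, done = item
            try:
                cursor = db.execute(command)
                db.commit()
                result["data"] = cursor.fetchall()
            except Exception as e:
                result["exception"] = e
            finally:
                done.set()
        db.close()

    def close(self):
        self.queue.put(STOP)
        self.worker.join()


db = ThreadedSqlite()
db.query("CREATE TABLE cache (path TEXT PRIMARY KEY, response TEXT)")
db.query("INSERT INTO cache VALUES ('a', 'hello')")
print(db.query("SELECT * FROM cache"))
db.close()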
Example no. 49
0
class ColumnList(Table, jx_base.Container):
    """
    OPTIMIZED FOR THE PARTICULAR ACCESS PATTERNS USED
    """

    def __init__(self, name):
        Table.__init__(self, "meta.columns")
        self.db_file = File("metadata." + name + ".sqlite")
        self.data = {}  # MAP FROM ES_INDEX TO (abs_column_name to COLUMNS)
        self.locker = Lock()
        self._schema = None
        self.db = sqlite3.connect(
            database=self.db_file.abspath, check_same_thread=False, isolation_level=None
        )
        self.last_load = Null
        self.todo = Queue(
            "update columns to db"
        )  # HOLD (action, column) PAIR, WHERE action in ['insert', 'update']
        self._db_load()
        Thread.run("update " + name, self._db_worker)

    @contextmanager
    def _db_transaction(self):
        self.db.execute(str("BEGIN"))
        try:
            yield
            self.db.execute(str("COMMIT"))
        except Exception as e:
            e = Except.wrap(e)
            self.db.execute(str("ROLLBACK"))
            Log.error("Transaction failed", cause=e)

    def _query(self, query):
        result = Data()
        curr = self.db.execute(query)
        result.meta.format = "table"
        result.header = [d[0] for d in curr.description] if curr.description else None
        result.data = curr.fetchall()
        return result

    def _db_create(self):
        with self._db_transaction():
            self.db.execute(
                "CREATE TABLE "
                + db_table_name
                + sql_iso(
                    sql_list(
                        [
                            quote_column(c.name)
                            + " "
                            + json_type_to_sqlite_type[c.jx_type]
                            for c in METADATA_COLUMNS
                        ]
                        + [
                            "PRIMARY KEY"
                            + sql_iso(
                                sql_list(map(quote_column, ["es_index", "es_column"]))
                            )
                        ]
                    )
                )
            )

            for c in METADATA_COLUMNS:
                self._add(c)
                self._db_insert_column(c)

    def _db_load(self):
        self.last_load = Date.now()

        result = self._query(
            SQL_SELECT
            + "name"
            + SQL_FROM
            + "sqlite_master"
            + SQL_WHERE
            + SQL_AND.join(["name=" + db_table_name, "type=" + quote_value("table")])
        )
        if not result.data:
            self._db_create()
            return

        result = self._query(
            SQL_SELECT
            + all_columns
            + SQL_FROM
            + db_table_name
            + SQL_ORDERBY
            + sql_list(map(quote_column, ["es_index", "name", "es_column"]))
        )

        with self.locker:
            for r in result.data:
                c = row_to_column(result.header, r)
                self._add(c)

    def _db_worker(self, please_stop):
        while not please_stop:
            try:
                with self._db_transaction():
                    result = self._query(
                        SQL_SELECT
                        + all_columns
                        + SQL_FROM
                        + db_table_name
                        + SQL_WHERE
                        + "last_updated > "
                        + quote_value(self.last_load)
                        + SQL_ORDERBY
                        + sql_list(map(quote_column, ["es_index", "name", "es_column"]))
                    )

                with self.locker:
                    for r in result.data:
                        c = row_to_column(result.header, r)
                        self._add(c)
                        if c.last_updated > self.last_load:
                            self.last_load = c.last_updated

                updates = self.todo.pop_all()
                DEBUG and updates and Log.note(
                    "{{num}} columns to push to db", num=len(updates)
                )
                for action, column in updates:
                    while not please_stop:
                        try:
                            with self._db_transaction():
                                DEBUG and Log.note(
                                    "{{action}} db for {{table}}.{{column}}",
                                    action=action,
                                    table=column.es_index,
                                    column=column.es_column,
                                )
                                if action is EXECUTE:
                                    self.db.execute(column)
                                elif action is UPDATE:
                                    self.db.execute(
                                        "UPDATE"
                                        + db_table_name
                                        + "SET"
                                        + sql_list(
                                            [
                                                "count=" + quote_value(column.count),
                                                "cardinality="
                                                + quote_value(column.cardinality),
                                                "multi=" + quote_value(column.multi),
                                                "partitions="
                                                + quote_value(
                                                    value2json(column.partitions)
                                                ),
                                                "last_updated="
                                                + quote_value(column.last_updated),
                                            ]
                                        )
                                        + SQL_WHERE
                                        + SQL_AND.join(
                                            [
                                                "es_index = "
                                                + quote_value(column.es_index),
                                                "es_column = "
                                                + quote_value(column.es_column),
                                                "last_updated < "
                                                + quote_value(column.last_updated),
                                            ]
                                        )
                                    )
                                elif action is DELETE:
                                    self.db.execute(
                                        "DELETE FROM"
                                        + db_table_name
                                        + SQL_WHERE
                                        + SQL_AND.join(
                                            [
                                                "es_index = "
                                                + quote_value(column.es_index),
                                                "es_column = "
                                                + quote_value(column.es_column),
                                            ]
                                        )
                                    )
                                else:
                                    self._db_insert_column(column)
                            break
                        except Exception as e:
                            e = Except.wrap(e)
                            if "database is locked" in e:
                                Log.note("metadata database is locked")
                                Till(seconds=1).wait()
                                break
                            else:
                                Log.warning("problem updataing database", cause=e)

            except Exception as e:
                Log.warning("problem updating database", cause=e)

            (Till(seconds=10) | please_stop).wait()

    def _db_insert_column(self, column):
        try:
            self.db.execute(
                "INSERT INTO"
                + db_table_name
                + sql_iso(all_columns)
                + "VALUES"
                + sql_iso(
                    sql_list(
                        [
                            quote_value(column[c.name])
                            if c.name not in ("nested_path", "partitions")
                            else quote_value(value2json(column[c.name]))
                            for c in METADATA_COLUMNS
                        ]
                    )
                )
            )
        except Exception as e:
            e = Except.wrap(e)
            if "UNIQUE constraint failed" in e or " are not unique" in e:
                # THIS CAN HAPPEN BECAUSE todo HAS OLD COLUMN DATA
                self.todo.add((UPDATE, column), force=True)
            else:
                Log.error("do not know how to handle", cause=e)

    def __copy__(self):
        output = object.__new__(ColumnList)
        Table.__init__(output, "meta.columns")
        output.data = {
            t: {c: list(cs) for c, cs in dd.items()} for t, dd in self.data.items()
        }
        output.locker = Lock()
        output._schema = None
        return output

    def find(self, es_index, abs_column_name=None):
        with self.locker:
            if es_index.startswith("meta."):
                self._update_meta()

            if not abs_column_name:
                return [c for cs in self.data.get(es_index, {}).values() for c in cs]
            else:
                return self.data.get(es_index, {}).get(abs_column_name, [])

    def extend(self, columns):
        self.dirty = True
        with self.locker:
            for column in columns:
                self._add(column)

    def add(self, column):
        self.dirty = True
        with self.locker:
            canonical = self._add(column)
        if canonical == None:
            return column  # ALREADY ADDED
        self.todo.add((INSERT if canonical is column else UPDATE, canonical))
        return canonical

    def remove_table(self, table_name):
        del self.data[table_name]

    def _add(self, column):
        """
        :param column: ANY COLUMN OBJECT
        :return:  None IF column IS canonical ALREADY (NET-ZERO EFFECT)
        """
        columns_for_table = self.data.setdefault(column.es_index, {})
        existing_columns = columns_for_table.setdefault(column.name, [])

        for canonical in existing_columns:
            if canonical is column:
                return None
            if canonical.es_type == column.es_type:
                if column.last_updated > canonical.last_updated:
                    for key in Column.__slots__:
                        old_value = canonical[key]
                        new_value = column[key]
                        if new_value == None:
                            pass  # DO NOT BOTHER CLEARING OLD VALUES (LIKE cardinality AND partitions)
                        elif new_value == old_value:
                            pass  # NO NEED TO UPDATE WHEN NO CHANGE MADE (COMMON CASE)
                        else:
                            canonical[key] = new_value
                return canonical
        existing_columns.append(column)
        return column

    def _update_meta(self):
        if not self.dirty:
            return

        for mcl in self.data.get("meta.columns").values():
            for mc in mcl:
                count = 0
                values = set()
                objects = 0
                multi = 1
                for column in self._all_columns():
                    value = column[mc.name]
                    if value == None:
                        pass
                    else:
                        count += 1
                        if is_list(value):
                            multi = max(multi, len(value))
                            try:
                                values |= set(value)
                            except Exception:
                                objects += len(value)
                        elif is_data(value):
                            objects += 1
                        else:
                            values.add(value)
                mc.count = count
                mc.cardinality = len(values) + objects
                mc.partitions = jx.sort(values)
                mc.multi = multi
                mc.last_updated = Date.now()
        self.dirty = False

    def _all_columns(self):
        return [
            column
            for t, cs in self.data.items()
            for _, css in cs.items()
            for column in css
        ]

    def __iter__(self):
        with self.locker:
            self._update_meta()
            return iter(self._all_columns())

    def __len__(self):
        return self.data["meta.columns"]["es_index"].count

    def update(self, command):
        self.dirty = True
        try:
            command = wrap(command)
            DEBUG and Log.note(
                "Update {{timestamp}}: {{command|json}}",
                command=command,
                timestamp=Date(command["set"].last_updated),
            )
            eq = command.where.eq
            if eq.es_index:
                if len(eq) == 1:
                    if unwraplist(command.clear) == ".":
                        with self.locker:
                            del self.data[eq.es_index]
                        self.todo.add(
                            (
                                EXECUTE,
                                "DELETE FROM "
                                + db_table_name
                                + SQL_WHERE
                                + " es_index="
                                + quote_value(eq.es_index),
                            )
                        )
                        return

                    # FASTEST
                    all_columns = self.data.get(eq.es_index, {}).values()
                    with self.locker:
                        columns = [c for cs in all_columns for c in cs]
                elif eq.es_column and len(eq) == 2:
                    # FASTER
                    all_columns = self.data.get(eq.es_index, {}).values()
                    with self.locker:
                        columns = [
                            c
                            for cs in all_columns
                            for c in cs
                            if c.es_column == eq.es_column
                        ]

                else:
                    # SLOWER
                    all_columns = self.data.get(eq.es_index, {}).values()
                    with self.locker:
                        columns = [
                            c
                            for cs in all_columns
                            for c in cs
                            if all(
                                c[k] == v for k, v in eq.items()
                            )  # THIS LINE IS VERY SLOW
                        ]
            else:
                columns = list(self)
                columns = jx.filter(columns, command.where)

            with self.locker:
                for col in columns:
                    DEBUG and Log.note(
                        "update column {{table}}.{{column}}",
                        table=col.es_index,
                        column=col.es_column,
                    )
                    for k in command["clear"]:
                        if k == ".":
                            self.todo.add((DELETE, col))
                            lst = self.data[col.es_index]
                            cols = lst[col.name]
                            cols.remove(col)
                            if len(cols) == 0:
                                del lst[col.name]
                                if len(lst) == 0:
                                    del self.data[col.es_index]
                            break
                        else:
                            col[k] = None
                    else:
                        # DID NOT DELETE COLUMN ("."), CONTINUE TO SET PROPERTIES
                        for k, v in command.set.items():
                            col[k] = v
                        self.todo.add((UPDATE, col))

        except Exception as e:
            Log.error("should not happen", cause=e)

    def query(self, query):
        # NOT EXPECTED TO BE RUN
        Log.error("not")
        with self.locker:
            self._update_meta()
            if not self._schema:
                self._schema = Schema(
                    ".", [c for cs in self.data["meta.columns"].values() for c in cs]
                )
            snapshot = self._all_columns()

        from jx_python.containers.list_usingPythonList import ListContainer

        query.frum = ListContainer("meta.columns", snapshot, self._schema)
        return jx.run(query)

    def groupby(self, keys):
        with self.locker:
            self._update_meta()
            return jx.groupby(self.__iter__(), keys)

    @property
    def schema(self):
        if not self._schema:
            with self.locker:
                self._update_meta()
                self._schema = Schema(
                    ".", [c for cs in self.data["meta.columns"].values() for c in cs]
                )
        return self._schema

    @property
    def namespace(self):
        return self

    def get_table(self, table_name):
        if table_name != "meta.columns":
            Log.error("this container has only the meta.columns")
        return self

    def denormalized(self):
        """
        THE INTERNAL STRUCTURE FOR THE COLUMN METADATA IS VERY DIFFERENT FROM
        THE DENORMALIZED PERSPECTIVE. THIS PROVIDES THAT PERSPECTIVE FOR QUERIES
        """
        with self.locker:
            self._update_meta()
            output = [
                {
                    "table": c.es_index,
                    "name": untype_path(c.name),
                    "cardinality": c.cardinality,
                    "es_column": c.es_column,
                    "es_index": c.es_index,
                    "last_updated": c.last_updated,
                    "count": c.count,
                    "nested_path": [unnest_path(n) for n in c.nested_path],
                    "es_type": c.es_type,
                    "type": c.jx_type,
                }
                for tname, css in self.data.items()
                for cname, cs in css.items()
                for c in cs
                if c.jx_type not in STRUCT  # and c.es_column != "_id"
            ]

        from jx_python.containers.list_usingPythonList import ListContainer

        return ListContainer(
            self.name,
            data=output,
            schema=jx_base.Schema("meta.columns", SIMPLE_METADATA_COLUMNS),
        )
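

# Illustrative sketch only (not part of the library above): the merge rule that
# ColumnList._add applies, reduced to plain dicts. A newer column with the same
# es_type updates the canonical one field-by-field, but None never clobbers an
# existing value (e.g. cardinality). The dicts and the merge_canonical name are
# hypothetical stand-ins for Column objects.
def merge_canonical(canonical, incoming):
    if incoming["last_updated"] <= canonical["last_updated"]:
        return canonical
    for key, new_value in incoming.items():
        if new_value is None:
            continue  # DO NOT CLEAR OLD VALUES
        if new_value == canonical.get(key):
            continue  # NO CHANGE, COMMON CASE
        canonical[key] = new_value
    return canonical


old = {"name": "a", "es_type": "keyword", "cardinality": 10, "last_updated": 1}
new = {"name": "a", "es_type": "keyword", "cardinality": None, "last_updated": 2}
assert merge_canonical(old, new)["cardinality"] == 10  # cardinality survives

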
class StructuredLogger_usingElasticSearch(StructuredLogger):
    @override
    def __init__(self, host, index, port=9200, type="log", max_size=1000, batch_size=100, kwargs=None):
        """
        settings ARE FOR THE ELASTICSEARCH INDEX
        """
        self.es = Cluster(kwargs).get_or_create_index(
            schema=mo_json.json2value(value2json(SCHEMA), leaves=True),
            limit_replicas=True,
            tjson=True,
            kwargs=kwargs
        )
        self.batch_size = batch_size
        self.es.add_alias(coalesce(kwargs.alias, kwargs.index))
        self.queue = Queue("debug logs to es", max=max_size, silent=True)
        self.es.settings.retry.times = coalesce(self.es.settings.retry.times, 3)
        self.es.settings.retry.sleep = Duration(coalesce(self.es.settings.retry.sleep, MINUTE))
        Thread.run("add debug logs to es", self._insert_loop)

    def write(self, template, params):
        if params.get("template"):
            # DETECTED INNER TEMPLATE, ASSUME TRACE IS ON, SO DO NOT NEED THE OUTER TEMPLATE
            self.queue.add({"value": params})
        else:
            template = strings.limit(template, 2000)
            self.queue.add({"value": {"template": template, "params": params}}, timeout=3 * MINUTE)
        return self

    def _insert_loop(self, please_stop=None):
        bad_count = 0
        while not please_stop:
            try:
                Till(seconds=1).wait()
                messages = wrap(self.queue.pop_all())
                if not messages:
                    continue

                for g, mm in jx.groupby(messages, size=self.batch_size):
                    scrubbed = []
                    try:
                        for i, message in enumerate(mm):
                            if message is THREAD_STOP:
                                please_stop.go()
                                return
                            scrubbed.append(_deep_json_to_string(message, depth=3))
                    finally:
                        self.es.extend(scrubbed)
                    bad_count = 0
            except Exception as e:
                Log.warning("Problem inserting logs into ES", cause=e)
                bad_count += 1
                if bad_count > MAX_BAD_COUNT:
                    Log.warning("Given up trying to write debug logs to ES index {{index}}", index=self.es.settings.index)
                Till(seconds=30).wait()

        # CONTINUE TO DRAIN THIS QUEUE
        while not please_stop:
            try:
                Till(seconds=1).wait()
                self.queue.pop_all()
            except Exception as e:
                Log.warning("Should not happen", cause=e)

    def stop(self):
        with suppress_exception:
            self.queue.add(THREAD_STOP)  # BE PATIENT, LET REST OF MESSAGE BE SENT

        with suppress_exception:
            self.queue.close()
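

# Illustrative sketch only (not part of the logger above): the queue-drain-and-batch
# pattern that _insert_loop uses, reduced to the standard library. "sink" stands in
# for the Elasticsearch index; the one-second pause and the batch size are arbitrary
# choices for this sketch, not library defaults.
import queue
import threading
import time


def insert_loop(q, sink, stop, batch_size=100):
    while not stop.is_set():
        time.sleep(1)
        messages = []
        while True:  # drain everything queued so far (pop_all equivalent)
            try:
                messages.append(q.get_nowait())
            except queue.Empty:
                break
        for i in range(0, len(messages), batch_size):
            sink.extend(messages[i:i + batch_size])  # one bulk write per batch


# Usage: a plain list collects the batches; a daemon thread plays the worker.
logs, sink, done = queue.Queue(), [], threading.Event()
threading.Thread(target=insert_loop, args=(logs, sink, done), daemon=True).start()
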
Exemplo n.º 51
0
class Clogger:

    # Singleton of the look-ahead scanner Clogger
    SINGLE_CLOGGER = None
    def __new__(cls, *args, **kwargs):
        if cls.SINGLE_CLOGGER is None:
            cls.SINGLE_CLOGGER = object.__new__(cls)
        return cls.SINGLE_CLOGGER


    def __init__(self, conn=None, tuid_service=None, start_workers=True, new_table=False, kwargs=None):
        try:
            self.config = kwargs
            self.conn = conn if conn else sql.Sql(self.config.database.name)
            self.hg_cache = HgMozillaOrg(kwargs=self.config.hg_cache, use_cache=True) if self.config.hg_cache else Null

            self.tuid_service = tuid_service if tuid_service else tuid.service.TUIDService(
                kwargs=self.config.tuid, conn=self.conn, clogger=self
            )
            self.rev_locker = Lock()
            self.working_locker = Lock()

            if new_table:
                with self.conn.transaction() as t:
                    t.execute("DROP TABLE IF EXISTS csetLog")

            self.init_db()
            self.next_revnum = coalesce(self.conn.get_one("SELECT max(revnum)+1 FROM csetLog")[0], 1)
            self.csets_todo_backwards = Queue(name="Clogger.csets_todo_backwards")
            self.deletions_todo = Queue(name="Clogger.deletions_todo")
            self.maintenance_signal = Signal(name="Clogger.maintenance_signal")

            if 'tuid' in self.config:
                self.config = self.config.tuid

            self.disable_backfilling = False
            self.disable_tipfilling = False
            self.disable_deletion = False
            self.disable_maintenance = False

            self.backfill_thread = None
            self.tipfill_thread = None
            self.deletion_thread = None
            self.maintenance_thread = None

            # Make sure we are filled before allowing queries
            numrevs = self.conn.get_one("SELECT count(revnum) FROM csetLog")[0]
            if numrevs < MINIMUM_PERMANENT_CSETS:
                Log.note("Filling in csets to hold {{minim}} csets.", minim=MINIMUM_PERMANENT_CSETS)
                oldest_rev = 'tip'
                with self.conn.transaction() as t:
                    tmp = t.query("SELECT min(revnum), revision FROM csetLog").data[0][1]
                    if tmp:
                        oldest_rev = tmp
                self._fill_in_range(
                    MINIMUM_PERMANENT_CSETS - numrevs,
                    oldest_rev,
                    timestamp=False
                )

            Log.note(
                "Table is filled with atleast {{minim}} entries.",
                minim=MINIMUM_PERMANENT_CSETS
            )

            if start_workers:
                self.start_workers()
        except Exception as e:
            Log.warning("Cannot setup clogger: {{cause}}", cause=str(e))


    def start_backfilling(self):
        if not self.backfill_thread:
            self.backfill_thread = Thread.run('clogger-backfill', self.fill_backward_with_list)


    def start_tipfilling(self):
        if not self.tipfill_thread:
            self.tipfill_thread = Thread.run('clogger-tip', self.fill_forward_continuous)


    def start_maintenance(self):
        if not self.maintenance_thread:
            self.maintenance_thread = Thread.run('clogger-maintenance', self.csetLog_maintenance)


    def start_deleter(self):
        if not self.deletion_thread:
            self.deletion_thread = Thread.run('clogger-deleter', self.csetLog_deleter)


    def start_workers(self):
        self.start_tipfilling()
        self.start_backfilling()
        self.start_maintenance()
        self.start_deleter()
        Log.note("Started clogger workers.")


    def init_db(self):
        with self.conn.transaction() as t:
            t.execute('''
            CREATE TABLE IF NOT EXISTS csetLog (
                revnum         INTEGER PRIMARY KEY,
                revision       CHAR(12) NOT NULL,
                timestamp      INTEGER
            );''')


    def disable_all(self):
        self.disable_tipfilling = True
        self.disable_backfilling = True
        self.disable_maintenance = True
        self.disable_deletion = True


    def revnum(self):
        """
        :return: max revnum that was added
        """
        return coalesce(self.conn.get_one("SELECT max(revnum) as revnum FROM csetLog")[0], 0)


    def get_tip(self, transaction):
        return transaction.get_one(
            "SELECT max(revnum) as revnum, revision FROM csetLog"
        )


    def get_tail(self, transaction):
        return transaction.get_one(
            "SELECT min(revnum) as revnum, revision FROM csetLog"
        )


    def _get_clog(self, clog_url):
        try:
            Log.note("Searching through changelog {{url}}", url=clog_url)
            clog_obj = http.get_json(clog_url, retry=RETRY)
            return clog_obj
        except Exception as e:
            Log.error(
                "Unexpected error getting changset-log for {{url}}: {{error}}",
                url=clog_url,
                error=e
            )


    def _get_one_revision(self, transaction, cset_entry):
        # Returns a single revision if it exists
        _, rev, _ = cset_entry
        return transaction.get_one("SELECT revision FROM csetLog WHERE revision=?", (rev,))


    def _get_one_revnum(self, transaction, rev):
        # Returns a single revnum if it exists
        return transaction.get_one("SELECT revnum FROM csetLog WHERE revision=?", (rev,))


    def _get_revnum_range(self, transaction, revnum1, revnum2):
        # Returns a range of revision numbers (that is inclusive)
        high_num = max(revnum1, revnum2)
        low_num = min(revnum1, revnum2)

        return transaction.query(
            "SELECT revnum, revision FROM csetLog WHERE "
            "revnum >= " + str(low_num) + " AND revnum <= " + str(high_num)
        ).data


    def recompute_table_revnums(self):
        '''
        Recomputes the revnums for the csetLog table
        by creating a new table, and copying csetLog to
        it. The INTEGER PRIMARY KEY in the temp table auto increments
        as rows are added.

        IMPORTANT: Only call this after acquiring the
                   lock `self.working_locker`.
        :return:
        '''
        with self.conn.transaction() as t:
            t.execute('''
            CREATE TABLE temp (
                revnum         INTEGER PRIMARY KEY,
                revision       CHAR(12) NOT NULL,
                timestamp      INTEGER
            );''')

            t.execute(
                "INSERT INTO temp (revision, timestamp) "
                "SELECT revision, timestamp FROM csetlog ORDER BY revnum ASC"
            )

            t.execute("DROP TABLE csetLog;")
            t.execute("ALTER TABLE temp RENAME TO csetLog;")


    def check_for_maintenance(self):
        '''
        Returns True if the maintenance worker should be run now,
        and False otherwise.
        :return:
        '''
        numrevs = self.conn.get_one("SELECT count(revnum) FROM csetLog")[0]
        Log.note("Number of csets in csetLog table: {{num}}", num=numrevs)
        if numrevs >= SIGNAL_MAINTENANCE_CSETS:
            return True
        return False


    def add_cset_entries(self, ordered_rev_list, timestamp=False, number_forward=True):
        '''
        Adds a list of revisions to the table. Assumes ordered_rev_list is ordered
        the same way changesets are found in the changelog; going forwards or
        backwards is handled by flipping the list.
        :param ordered_rev_list: Order given by the changeset-log search.
        :param timestamp: If False, records are kept indefinitely,
                          but if holes exist: (delete, None, delete, None)
                          those delete's with None's around them
                          will not be deleted.
        :param number_forward: If True, number the new revisions forward from
                               max(revnum); otherwise number them backwards from
                               min(revnum). recompute_table_revnums() is called
                               afterwards to keep the revnums dense.
        :return:
        '''
        with self.conn.transaction() as t:
            current_min = t.get_one("SELECT min(revnum) FROM csetlog")[0]
            current_max = t.get_one("SELECT max(revnum) FROM csetlog")[0]
            if not current_min or not current_max:
                current_min = 0
                current_max = 0

            direction = -1
            start = current_min - 1
            if number_forward:
                direction = 1
                start = current_max + 1
                ordered_rev_list = ordered_rev_list[::-1]

            insert_list = [
                (
                    start + direction * count,
                    rev,
                    int(time.time()) if timestamp else -1
                )
                for count, rev in enumerate(ordered_rev_list)
            ]

            # In case of overlapping requests
            fmt_insert_list = []
            for cset_entry in insert_list:
                tmp = self._get_one_revision(t, cset_entry)
                if not tmp:
                    fmt_insert_list.append(cset_entry)

            for _, tmp_insert_list in jx.groupby(fmt_insert_list, size=SQL_CSET_BATCH_SIZE):
                t.execute(
                    "INSERT INTO csetLog (revnum, revision, timestamp)" +
                    " VALUES " +
                    sql_list(
                        quote_set((revnum, revision, timestamp))
                        for revnum, revision, timestamp in tmp_insert_list
                    )
                )

            # Move the revision numbers forward if needed
            self.recompute_table_revnums()

        # Start a maintenance run if needed
        if self.check_for_maintenance():
            Log.note("Scheduling maintenance run on clogger.")
            self.maintenance_signal.go()


    def _fill_in_range(self, parent_cset, child_cset, timestamp=False, number_forward=True):
        '''
        Fills the cset log over a certain range. 'parent_cset' can be an int, in which
        case that many changesets are gathered instead; an int also implies we are
        going backwards (number_forward is False), so the first changeset of the
        first log is skipped. Otherwise, we continue until the given 'parent_cset'
        is found.
        :param parent_cset:
        :param child_cset:
        :param timestamp:
        :param number_forward:
        :return:
        '''
        csets_to_add = []
        found_parent = False
        find_parent = False
        if type(parent_cset) != int:
            find_parent = True
        elif parent_cset >= MAX_BACKFILL_CLOGS * CHANGESETS_PER_CLOG:
            Log.warning(
                "Requested number of new changesets {{num}} is too high. "
                "Max number that can be requested is {{maxnum}}.",
                num=parent_cset,
                maxnum=MAX_BACKFILL_CLOGS * CHANGESETS_PER_CLOG
            )
            return None

        csets_found = 0
        clogs_seen = 0
        final_rev = child_cset
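        # Walk json-log pages backwards from child_cset, collecting changesets until
        # the requested parent (or the requested count) is found, or until
        # MAX_BACKFILL_CLOGS pages have been scanned.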
        while not found_parent and clogs_seen < MAX_BACKFILL_CLOGS:
            clog_url = str(HG_URL) + "/" + self.config.hg.branch + "/json-log/" + final_rev
            clog_obj = self._get_clog(clog_url)
            clog_csets_list = list(clog_obj['changesets'])
            for clog_cset in clog_csets_list[:-1]:
                if not number_forward and csets_found <= 0:
                    # Skip this entry, it already exists
                    csets_found += 1
                    continue

                nodes_cset = clog_cset['node'][:12]
                if find_parent:
                    if nodes_cset == parent_cset:
                        found_parent = True
                        if not number_forward:
                            # When going forward this entry is
                            # the given parent
                            csets_to_add.append(nodes_cset)
                        break
                else:
                    if csets_found + 1 > parent_cset:
                        found_parent = True
                        if not number_forward:
                            # When going forward this entry is
                            # the given parent (which is supposed
                            # to already exist)
                            csets_to_add.append(nodes_cset)
                        break
                    csets_found += 1
                csets_to_add.append(nodes_cset)
            if found_parent:
                break

            clogs_seen += 1
            final_rev = clog_csets_list[-1]['node'][:12]

        if found_parent:
            self.add_cset_entries(csets_to_add, timestamp=timestamp, number_forward=number_forward)
        else:
            Log.warning(
                "Couldn't find the end of the request for {{request}}. "
                "Max number that can be requested through _fill_in_range is {{maxnum}}.",
                request={
                    'parent_cset': parent_cset,
                    'child_cset':child_cset,
                    'number_forward': number_forward
                },
                maxnum=MAX_BACKFILL_CLOGS * CHANGESETS_PER_CLOG
            )
            return None
        return csets_to_add


    def initialize_to_range(self, old_rev, new_rev, delete_old=True):
        '''
        Used in service testing to get to very old
        changesets quickly.
        :param old_rev: The oldest revision to keep
        :param new_rev: The revision to start searching from
        :return:
        '''
        old_settings = [
            self.disable_tipfilling,
            self.disable_backfilling,
            self.disable_maintenance,
            self.disable_deletion
        ]
        self.disable_tipfilling = True
        self.disable_backfilling = True
        self.disable_maintenance = True
        self.disable_deletion = True

        old_rev = old_rev[:12]
        new_rev = new_rev[:12]

        with self.working_locker:
            if delete_old:
                with self.conn.transaction() as t:
                    t.execute("DELETE FROM csetLog")
            with self.conn.transaction() as t:
                t.execute(
                    "INSERT INTO csetLog (revision, timestamp) VALUES " +
                    quote_set((new_rev, -1))
                )
            self._fill_in_range(old_rev, new_rev, timestamp=True, number_forward=False)

        self.disable_tipfilling = old_settings[0]
        self.disable_backfilling = old_settings[1]
        self.disable_maintenance = old_settings[2]
        self.disable_deletion = old_settings[3]


    def fill_backward_with_list(self, please_stop=None):
        '''
        Expects requests of the tuple form: (parent_cset, timestamp)
        parent_cset can be an int X to go back by X changesets, or
        a string to search for going backwards in time. If timestamp
        is false, no timestamps will be added to the entries.
        :param please_stop:
        :return:
        '''
        while not please_stop:
            try:
                request = self.csets_todo_backwards.pop(till=please_stop)
                if please_stop:
                    break

                # If backfilling is disabled, all requests
                # are ignored.
                if self.disable_backfilling:
                    Till(till=CSET_BACKFILL_WAIT_TIME).wait()
                    continue

                if request:
                    parent_cset, timestamp = request
                else:
                    continue

                with self.working_locker:
                    with self.conn.transaction() as t:
                        parent_revnum = self._get_one_revnum(t, parent_cset)
                    if parent_revnum:
                        continue

                    with self.conn.transaction() as t:
                        _, oldest_revision = self.get_tail(t)

                    self._fill_in_range(
                        parent_cset,
                        oldest_revision,
                        timestamp=timestamp,
                        number_forward=False
                    )
                Log.note("Finished {{cset}}", cset=parent_cset)
            except Exception as e:
                Log.warning("Unknown error occurred during backfill: ", cause=e)


    def update_tip(self):
        '''
        Returns False if the tip is already at the newest, or True
        if an update has taken place.
        :return:
        '''
        clog_obj = self._get_clog(
            str(HG_URL) + "/" + self.config.hg.branch + "/json-log/tip"
        )

        # Get current tip in DB
        with self.conn.transaction() as t:
            _, newest_known_rev = self.get_tip(t)

        # If we are still at the newest, wait for CSET_TIP_WAIT_TIME seconds
        # before checking again.
        first_clog_entry = clog_obj['changesets'][0]['node'][:12]
        if newest_known_rev == first_clog_entry:
            return False

        csets_to_gather = None
        if not newest_known_rev:
            Log.note(
                "No revisions found in table, adding {{minim}} entries...",
                minim=MINIMUM_PERMANENT_CSETS
            )
            csets_to_gather = MINIMUM_PERMANENT_CSETS

        found_newest_known = False
        csets_to_add = []
        csets_found = 0
        clogs_seen = 0
        Log.note("Found new revisions. Updating csetLog tip to {{rev}}...", rev=first_clog_entry)
        while not found_newest_known and clogs_seen < MAX_TIPFILL_CLOGS:
            clog_csets_list = list(clog_obj['changesets'])
            for clog_cset in clog_csets_list[:-1]:
                nodes_cset = clog_cset['node'][:12]
                if not csets_to_gather:
                    if nodes_cset == newest_known_rev:
                        found_newest_known = True
                        break
                else:
                    if csets_found >= csets_to_gather:
                        found_newest_known = True
                        break
                csets_found += 1
                csets_to_add.append(nodes_cset)
            if not found_newest_known:
                # Get the next page
                clogs_seen += 1
                final_rev = clog_csets_list[-1]['node'][:12]
                clog_url = str(HG_URL) + "/" + self.config.hg.branch + "/json-log/" + final_rev
                clog_obj = self._get_clog(clog_url)

        if clogs_seen >= MAX_TIPFILL_CLOGS:
            Log.error(
                "Too many changesets, can't find last tip or the number is too high: {{rev}}. "
                "Maximum possible to request is {{maxnum}}",
                rev=coalesce(newest_known_rev, csets_to_gather),
                maxnum=MAX_TIPFILL_CLOGS * CHANGESETS_PER_CLOG
            )
            return False

        with self.working_locker:
            Log.note("Adding {{csets}}", csets=csets_to_add)
            self.add_cset_entries(csets_to_add, timestamp=False)
        return True


    def fill_forward_continuous(self, please_stop=None):
        while not please_stop:
            try:
                while not please_stop and not self.disable_tipfilling and self.update_tip():
                    pass
                (please_stop | Till(seconds=CSET_TIP_WAIT_TIME)).wait()
            except Exception as e:
                Log.warning("Unknown error occurred during tip filling:", cause=e)


    def csetLog_maintenance(self, please_stop=None):
        '''
        Handles deleting old csetLog entries and timestamping
        revisions once they pass the length for permanent
        storage for deletion later.
        :param please_stop:
        :return:
        '''
        while not please_stop:
            try:
                # Wait until something signals the maintenance cycle
                # to begin (or end).
                (self.maintenance_signal | please_stop).wait()

                if please_stop:
                    break
                if self.disable_maintenance:
                    continue

                Log.warning(
                    "Starting clog maintenance. Since this doesn't start often, "
                    "we need to explicitly see when it's started with this warning."
                )

                # Reset signal so we don't request
                # maintenance infinitely.
                with self.maintenance_signal.lock:
                    self.maintenance_signal._go = False

                with self.working_locker:
                    all_data = None
                    with self.conn.transaction() as t:
                        all_data = sorted(
                            t.get("SELECT revnum, revision, timestamp FROM csetLog"),
                            key=lambda x: int(x[0])
                        )

                    # Restore maximum permanents (if overflowing)
                    new_data = []
                    modified = False
                    for count, (revnum, revision, timestamp) in enumerate(all_data[::-1]):
                        if count < MINIMUM_PERMANENT_CSETS:
                            if timestamp != -1:
                                modified = True
                                new_data.append((revnum, revision, -1))
                            else:
                                new_data.append((revnum, revision, timestamp))
                        elif type(timestamp) != int or timestamp == -1:
                            modified = True
                            new_data.append((revnum, revision, int(time.time())))
                        else:
                            new_data.append((revnum, revision, timestamp))

                    # Delete annotations at revisions with timestamps
                    # that are too old. The csetLog entries will have
                    # their timestamps reset here.
                    new_data1 = []
                    annrevs_to_del = []
                    current_time = time.time()
                    for count, (revnum, revision, timestamp) in enumerate(new_data[::-1]):
                        new_timestamp = timestamp
                        if timestamp != -1:
                            if current_time >= timestamp + TIME_TO_KEEP_ANNOTATIONS.seconds:
                                modified = True
                                new_timestamp = current_time
                                annrevs_to_del.append(revision)
                        new_data1.append((revnum, revision, new_timestamp))

                    if len(annrevs_to_del) > 0:
                        # Delete any latestFileMod and annotation entries
                        # that are too old.
                        Log.note(
                            "Deleting annotations and latestFileMod for revisions for being "
                            "older than {{oldest}}: {{revisions}}",
                            oldest=TIME_TO_KEEP_ANNOTATIONS,
                            revisions=annrevs_to_del
                        )
                        with self.conn.transaction() as t:
                            t.execute(
                                "DELETE FROM latestFileMod WHERE revision IN " +
                                quote_set(annrevs_to_del)
                            )
                            t.execute(
                                "DELETE FROM annotations WHERE revision IN " +
                                quote_set(annrevs_to_del)
                            )

                    # Delete any overflowing entries
                    new_data2 = new_data1
                    reved_all_data = all_data[::-1]
                    deleted_data = reved_all_data[MAXIMUM_NONPERMANENT_CSETS:]
                    delete_overflowing_revstart = None
                    if len(deleted_data) > 0:
                        _, delete_overflowing_revstart, _ = deleted_data[0]
                        new_data2 = set(all_data) - set(deleted_data)

                        # Update old frontiers if requested, otherwise
                        # they will all get deleted by the csetLog_deleter
                        # worker
                        if UPDATE_VERY_OLD_FRONTIERS:
                            _, max_revision, _ = all_data[-1]
                            for _, revision, _ in deleted_data:
                                with self.conn.transaction() as t:
                                    old_files = t.get(
                                        "SELECT file FROM latestFileMod WHERE revision=?",
                                        (revision,)
                                    )
                                if old_files is None or len(old_files) <= 0:
                                    continue

                                self.tuid_service.get_tuids_from_files(
                                    old_files,
                                    max_revision,
                                    going_forward=True,
                                )

                                still_exist = True
                                while still_exist and not please_stop:
                                    Till(seconds=TUID_EXISTENCE_WAIT_TIME).wait()
                                    with self.conn.transaction() as t:
                                        old_files = t.get(
                                            "SELECT file FROM latestFileMod WHERE revision=?",
                                            (revision,)
                                        )
                                    if old_files is None or len(old_files) <= 0:
                                        still_exist = False

                    # Update table and schedule a deletion
                    if modified:
                        with self.conn.transaction() as t:
                            insert_into_db_chunked(
                                t,
                                new_data2,
                                "INSERT OR REPLACE INTO csetLog (revnum, revision, timestamp) VALUES "
                            )
                    if not deleted_data:
                        continue

                    Log.note("Scheduling {{num_csets}} for deletion", num_csets=len(deleted_data))
                    self.deletions_todo.add(delete_overflowing_revstart)
            except Exception as e:
                Log.warning("Unexpected error occured while maintaining csetLog, continuing to try: ", cause=e)
        return


    def csetLog_deleter(self, please_stop=None):
        '''
        Deletes changesets from the csetLog table
        and also changesets from the annotation table
        that have revisions matching the given changesets.
        Accepts lists of csets from self.deletions_todo.
        :param please_stop:
        :return:
        '''
        while not please_stop:
            try:
                request = self.deletions_todo.pop(till=please_stop)
                if please_stop:
                    break

                # If deletion is disabled, ignore the current
                # request - it will need to be re-requested.
                if self.disable_deletion:
                    Till(till=CSET_DELETION_WAIT_TIME).wait()
                    continue

                with self.working_locker:
                    first_cset = request

                    # Since we are deleting and moving stuff around in the
                    # TUID tables, we need everything to be contained in
                    # one transaction with no interruptions.
                    with self.conn.transaction() as t:
                        revnum = self._get_one_revnum(t, first_cset)[0]
                        csets_to_del = t.get(
                            "SELECT revnum, revision FROM csetLog WHERE revnum <= ?", (revnum,)
                        )
                        csets_to_del = [cset for _, cset in csets_to_del]
                        existing_frontiers = t.query(
                            "SELECT revision FROM latestFileMod WHERE revision IN " +
                            quote_set(csets_to_del)
                        ).data

                        existing_frontiers = [r[0] for r in existing_frontiers]
                        Log.note(
                            "Deleting all annotations and changeset log entries with revisions in the list: {{csets}}",
                            csets=csets_to_del
                        )

                        if len(existing_frontiers) > 0:
                            # This handles files which no longer exist anymore in
                            # the main branch.
                            Log.note(
                                "Deleting existing frontiers for revisions: {{revisions}}",
                                revisions=existing_frontiers
                            )
                            t.execute(
                                "DELETE FROM latestFileMod WHERE revision IN " +
                                quote_set(existing_frontiers)
                            )

                        Log.note("Deleting annotations...")
                        t.execute(
                            "DELETE FROM annotations WHERE revision IN " +
                            quote_set(csets_to_del)
                        )

                        Log.note(
                            "Deleting {{num_entries}} csetLog entries...",
                            num_entries=len(csets_to_del)
                        )
                        t.execute(
                            "DELETE FROM csetLog WHERE revision IN " +
                            quote_set(csets_to_del)
                        )

                    # Recalculate the revnums
                    self.recompute_table_revnums()
            except Exception as e:
                Log.warning("Unexpected error occured while deleting from csetLog:", cause=e)
                Till(seconds=CSET_DELETION_WAIT_TIME).wait()
        return


    def get_old_cset_revnum(self, revision):
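        # Ask the backfill worker for this revision, then poll the table until its
        # revnum appears or BACKFILL_REVNUM_TIMEOUT passes.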
        self.csets_todo_backwards.add((revision, True))

        revnum = None
        timeout = Till(seconds=BACKFILL_REVNUM_TIMEOUT)
        while not timeout:
            with self.conn.transaction() as t:
                revnum = self._get_one_revnum(t, revision)

            if revnum and revnum[0] >= 0:
                break
            elif revnum and revnum[0] < 0:
                Log.note("Waiting for table to recompute...")
            else:
                Log.note("Waiting for backfill to complete...")
            Till(seconds=CSET_BACKFILL_WAIT_TIME).wait()

        if timeout:
            Log.error(
                "Cannot find revision {{rev}} after waiting {{timeout}} seconds",
                rev=revision,
                timeout=BACKFILL_REVNUM_TIMEOUT
            )
        return revnum


    def get_revnnums_from_range(self, revision1, revision2):
        with self.conn.transaction() as t:
            revnum1 = self._get_one_revnum(t, revision1)
            revnum2 = self._get_one_revnum(t, revision2)
        if not revnum1 or not revnum2:
            did_an_update = self.update_tip()
            if did_an_update:
                with self.conn.transaction() as t:
                    revnum1 = self._get_one_revnum(t, revision1)
                    revnum2 = self._get_one_revnum(t, revision2)

            if not revnum1:
                revnum1 = self.get_old_cset_revnum(revision1)
                # Refresh the second entry
                with self.conn.transaction() as t:
                    revnum2 = self._get_one_revnum(t, revision2)

            if not revnum2:
                revnum2 = self.get_old_cset_revnum(revision2)

                # The first revnum might change also
                with self.conn.transaction() as t:
                    revnum1 = self._get_one_revnum(t, revision1)

        with self.conn.transaction() as t:
            result = self._get_revnum_range(t, revnum1[0], revnum2[0])
        return sorted(
            result,
            key=lambda x: int(x[0])
        )
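

# Illustrative sketch only (stdlib sqlite3, made-up data): the renumbering trick
# used by Clogger.recompute_table_revnums above. Copying rows, ordered by the old
# revnum, into a fresh table whose INTEGER PRIMARY KEY auto-increments yields
# dense, gap-free revnums; the scratch-table name here is hypothetical.
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE csetLog (revnum INTEGER PRIMARY KEY, revision CHAR(12) NOT NULL, timestamp INTEGER)")
conn.executemany(
    "INSERT INTO csetLog VALUES (?, ?, ?)",
    [(5, "aaa", -1), (9, "bbb", -1), (12, "ccc", -1)],  # gaps left by deletions
)
conn.execute("CREATE TABLE csetLog_new (revnum INTEGER PRIMARY KEY, revision CHAR(12) NOT NULL, timestamp INTEGER)")
conn.execute("INSERT INTO csetLog_new (revision, timestamp) SELECT revision, timestamp FROM csetLog ORDER BY revnum ASC")
conn.execute("DROP TABLE csetLog")
conn.execute("ALTER TABLE csetLog_new RENAME TO csetLog")
print(conn.execute("SELECT revnum, revision FROM csetLog").fetchall())  # [(1, 'aaa'), (2, 'bbb'), (3, 'ccc')]
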
Exemplo n.º 52
0
class ElasticsearchMetadata(Namespace):
    """
    MANAGE SNOWFLAKE SCHEMAS FOR EACH OF THE ALIASES FOUND IN THE CLUSTER
    """

    @override
    def __new__(cls, kwargs, *args, **_kwargs):
        es_cluster = elasticsearch.Cluster(kwargs)
        output = known_clusters.get(id(es_cluster))
        if output is None:
            output = object.__new__(cls)
            known_clusters[id(es_cluster)] = output
        return output

    @override
    def __init__(self, host, index, sql_file='metadata.sqlite', alias=None, name=None, port=9200, kwargs=None):
        if hasattr(self, "settings"):
            return

        self.too_old = TOO_OLD
        self.settings = kwargs
        self.default_name = coalesce(name, alias, index)
        self.es_cluster = elasticsearch.Cluster(kwargs=kwargs)

        self.index_does_not_exist = set()
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        self.index_to_alias = Relation_usingList()

        self.es_metadata = Null
        self.metadata_last_updated = Date.now() - OLD_METADATA

        self.meta = Data()
        self.meta.columns = ColumnList()

        self.alias_to_query_paths = {
            "meta.columns": [['.']],
            "meta.tables": [['.']]
        }
        self.alias_last_updated = {
            "meta.columns": Date.now(),
            "meta.tables": Date.now()
        }
        table_columns = metadata_tables()
        self.meta.tables = ListContainer(
            "meta.tables",
            [
                # TableDesc("meta.columns", None, ".", Date.now()),
                # TableDesc("meta.tables", None, ".", Date.now())
            ],
            jx_base.Schema(".", table_columns)
        )
        self.meta.columns.extend(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("refresh metadata", self.not_monitor)
        return

    @property
    def namespace(self):
        return self.meta.columns.namespace

    @property
    def url(self):
        return self.es_cluster.url / self.default_name.replace(".", "/")

    def _reload_columns(self, table_desc):
        """
        :param alias: A REAL ALIAS (OR NAME OF INDEX THAT HAS NO ALIAS)
        :return:
        """
        # FIND ALL INDEXES OF ALIAS
        es_last_updated = self.es_cluster.metatdata_last_updated

        alias = table_desc.name
        canonical_index = self.es_cluster.get_best_matching_index(alias).index
        update_required = not (table_desc.timestamp < es_last_updated)
        metadata = self.es_cluster.get_metadata(force=update_required)

        indexes = self.index_to_alias.get_domain(alias)
        props = [
            (self.es_cluster.get_index(index=i, type=t, debug=DEBUG), t, m.properties)
            for i, d in metadata.indices.items()
            if i in indexes
            for t, m in [_get_best_type_from_mapping(d.mappings)]
        ]

        # CONFIRM ALL COLUMNS ARE SAME, FIX IF NOT
        dirty = False
        all_comparisons = list(jx.pairwise(props)) + list(jx.pairwise(jx.reverse(props)))
        # NOTICE THE SAME (index, type, properties) TRIPLE FROM ABOVE
        for (i1, t1, p1), (i2, t2, p2) in all_comparisons:
            diff = elasticsearch.diff_schema(p2, p1)
            if not self.settings.read_only:
                for d in diff:
                    dirty = True
                    i1.add_property(*d)
        meta = self.es_cluster.get_metadata(force=dirty).indices[canonical_index]

        data_type, mapping = _get_best_type_from_mapping(meta.mappings)
        mapping.properties["_id"] = {"type": "string", "index": "not_analyzed"}
        self._parse_properties(alias, mapping, meta)
        table_desc.timestamp = es_last_updated

    def _parse_properties(self, alias, mapping, meta):
        abs_columns = elasticsearch.parse_properties(alias, None, mapping.properties)
        if any(c.cardinality == 0 and c.names['.'] != '_id' for c in abs_columns):
            Log.warning(
                "Some columns are not stored {{names}}",
                names=[
                    ".".join((c.es_index, c.names['.']))
                    for c in abs_columns
                    if c.cardinality == 0
                ]
            )

        with Timer("upserting {{num}} columns", {"num": len(abs_columns)}, silent=not DEBUG):
            # LIST OF EVERY NESTED PATH
            query_paths = [[c.es_column] for c in abs_columns if c.es_type == "nested"]
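            # EXPAND EACH NESTED QUERY PATH TO INCLUDE ITS ANCESTOR NESTED PATHS (LONGEST FIRST)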
            for a, b in itertools.product(query_paths, query_paths):
                aa = a[0]
                bb = b[0]
                if aa and bb.startswith(aa):
                    for i, b_prefix in enumerate(b):
                        if len(b_prefix) > len(aa):
                            continue
                        if aa == b_prefix:
                            break  # SPLIT ALREADY FOUND
                        b.insert(i, aa)
                        break
            for q in query_paths:
                q.append(SELF_PATH)
            query_paths.append(ROOT_PATH)
            self.alias_to_query_paths[alias] = query_paths
            for i in self.index_to_alias.get_domain(alias):
                self.alias_to_query_paths[i] = query_paths

            # ADD RELATIVE NAMES
            for abs_column in abs_columns:
                abs_column.last_updated = None
                abs_column.jx_type = jx_type(abs_column)
                for query_path in query_paths:
                    abs_column.names[query_path[0]] = relative_field(abs_column.names["."], query_path[0])
                self.todo.add(self.meta.columns.add(abs_column))
        pass

    def query(self, _query):
        return self.meta.columns.query(QueryOp(set_default(
            {
                "from": self.meta.columns,
                "sort": ["table", "name"]
            },
            _query.__data__()
        )))

    def _find_alias(self, name):
        if self.metadata_last_updated < self.es_cluster.metatdata_last_updated:
            for a in self.es_cluster.get_aliases():
                self.index_to_alias[a.index] = coalesce(a.alias, a.index)
                self.alias_last_updated.setdefault(a.alias, Date.MIN)
        if name in self.alias_last_updated:
            return name
        else:
            return self.index_to_alias[name]

    def get_columns(self, table_name, column_name=None, force=False):
        """
        RETURN METADATA COLUMNS
        """
        table_path = split_field(table_name)
        root_table_name = table_path[0]

        alias = self._find_alias(root_table_name)
        if not alias:
            self.es_cluster.get_metadata(force=True)
            alias = self._find_alias(root_table_name)
            if not alias:
                Log.error("{{table|quote}} does not exist", table=table_name)

        try:
            last_update = MAX([
                self.es_cluster.index_last_updated[i]
                for i in self.index_to_alias.get_domain(alias)
            ])

            table = self.get_table(alias)[0]
            # LAST TIME WE GOT INFO FOR THIS TABLE
            if not table:
                table = TableDesc(
                    name=alias,
                    url=None,
                    query_path=['.'],
                    timestamp=Date.MIN
                )
                with self.meta.tables.locker:
                    self.meta.tables.add(table)
                self._reload_columns(table)
            elif force or table.timestamp < last_update:
                self._reload_columns(table)

            columns = self.meta.columns.find(alias, column_name)
            columns = jx.sort(columns, "names.\\.")
            # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
            while len(self.todo) and not all(columns.get("last_updated")):
                if DEBUG:
                    if len(columns) > 10:
                        Log.note("waiting for {{num}} columns to update", num=len([c for c in columns if not c.last_updated]))
                    else:
                        Log.note("waiting for columns to update {{columns|json}}", columns=[c.es_index+"."+c.es_column for c in columns if not c.last_updated])
                Till(seconds=1).wait()
            return columns
        except Exception as e:
            Log.error("Not expected", cause=e)

        return []

    def _update_cardinality(self, column):
        """
        QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN
        """
        if column.es_index in self.index_does_not_exist:
            return

        if column.jx_type in STRUCT:
            Log.error("not supported")
        try:
            if column.es_index == "meta.columns":
                partitions = jx.sort([g[column.es_column] for g, _ in jx.groupby(self.meta.columns, column.es_column) if g[column.es_column] != None])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.columns),
                        "cardinality": len(partitions),
                        "multi": 1,
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            if column.es_index == "meta.tables":
                partitions = jx.sort([g[column.es_column] for g, _ in jx.groupby(self.meta.tables, column.es_column) if g[column.es_column] != None])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.tables),
                        "cardinality": len(partitions),
                        "multi": 1,
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return

            es_index = column.es_index.split(".")[0]

            is_text = [cc for cc in self.meta.columns if cc.es_column == column.es_column and cc.es_type == "text"]
            if is_text:
                # text IS A MULTIVALUE STRING THAT CAN ONLY BE FILTERED
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "aggs": {
                        "count": {"filter": {"match_all": {}}}
                    },
                    "size": 0
                })
                count = result.hits.total
                cardinality = max(1001, count)
                multi = 1001
            elif column.es_column == "_id":
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "query": {"match_all": {}},
                    "size": 0
                })
                count = cardinality = result.hits.total
                multi = 1
            elif column.es_type == BOOLEAN:
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "aggs": {
                        "count": _counting_query(column)
                    },
                    "size": 0
                })
                count = result.hits.total
                cardinality = 2
                multi = 1
            else:
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "aggs": {
                        "count": _counting_query(column),
                        "multi": {"max": {"script": "doc[" + quote(column.es_column) + "].values.size()"}}
                    },
                    "size": 0
                })
                agg_results = result.aggregations
                count = result.hits.total
                cardinality = coalesce(agg_results.count.value, agg_results.count._nested.value, agg_results.count.doc_count)
                multi = int(coalesce(agg_results.multi.value, 1))
                if cardinality == None:
                    Log.error("logic error")

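            # DECIDE IF PARTITIONS ARE WORTH KEEPING: HIGH-CARDINALITY COLUMNS (OVER
            # 1000 DISTINCT VALUES, OR NEARLY ONE VALUE PER DOCUMENT) ONLY GET COUNTS;
            # OTHERWISE A terms AGGREGATION BELOW COLLECTS THE ACTUAL PARTITIONS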
            query = Data(size=0)

            if column.es_column == "_id":
                self.meta.columns.update({
                    "set": {
                        "count": cardinality,
                        "cardinality": cardinality,
                        "multi": 1,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            elif cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99):
                DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            elif column.es_type in elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
                DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            elif len(column.nested_path) != 1:
                query.aggs["_"] = {
                    "nested": {"path": column.nested_path[0]},
                    "aggs": {"_nested": {"terms": {"field": column.es_column}}}
                }
            elif cardinality == 0:
                query.aggs["_"] = {"terms": {"field": column.es_column}}
            else:
                query.aggs["_"] = {"terms": {"field": column.es_column, "size": cardinality}}

            result = self.es_cluster.post("/" + es_index + "/_search", data=query)

            aggs = result.aggregations._
            if aggs._nested:
                parts = jx.sort(aggs._nested.buckets.key)
            else:
                parts = jx.sort(aggs.buckets.key)

            self.meta.columns.update({
                "set": {
                    "count": count,
                    "cardinality": cardinality,
                    "multi": multi,
                    "partitions": parts,
                    "last_updated": Date.now()
                },
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
        except Exception as e:
            # CANNOT IMPORT TEST_TABLE: THE TEST MODULE SETS UP LOGGING
            # from tests.test_jx import TEST_TABLE
            e = Except.wrap(e)
            TEST_TABLE = "testdata"
            is_missing_index = any(w in e for w in ["IndexMissingException", "index_not_found_exception"])
            is_test_table = column.es_index.startswith((TEST_TABLE_PREFIX, TEST_TABLE))
            if is_missing_index and is_test_table:
                # WE EXPECT TEST TABLES TO DISAPPEAR
                self.meta.columns.update({
                    "clear": ".",
                    "where": {"eq": {"es_index": column.es_index}}
                })
                self.index_does_not_exist.add(column.es_index)
            else:
                self.meta.columns.update({
                    "set": {
                        "last_updated": Date.now()
                    },
                    "clear": [
                        "count",
                        "cardinality",
                        "multi",
                        "partitions",
                    ],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                Log.warning("Could not get {{col.es_index}}.{{col.es_column}} info", col=column, cause=e)

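    # BACKGROUND LOOP: WHEN self.todo IS EMPTY, RE-QUEUE COLUMNS NOT UPDATED WITHIN
    # TOO_OLD, THEN POP COLUMNS AND REFRESH THEIR CARDINALITY ONE AT A TIME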
    def monitor(self, please_stop):
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            try:
                if not self.todo:
                    old_columns = [
                        c
                        for c in self.meta.columns
                        if (c.last_updated == None or c.last_updated < Date.now()-TOO_OLD) and c.jx_type not in STRUCT
                    ]
                    if old_columns:
                        DEBUG and Log.note(
                            "Old columns {{names|json}} last updated {{dates|json}}",
                            names=wrap(old_columns).es_column,
                            dates=[Date(t).format() for t in wrap(old_columns).last_updated]
                        )
                        self.todo.extend(old_columns)
                        # TEST CONSISTENCY
                        for c, d in product(list(self.todo.queue), list(self.todo.queue)):
                            if c.es_column == d.es_column and c.es_index == d.es_index and c != d:
                                Log.error("duplicate column {{col}} in metadata todo queue", col=c.es_index + "." + c.es_column)
                    else:
                        DEBUG and Log.note("no more metadata to update")

                column = self.todo.pop(Till(seconds=(10*MINUTE).seconds))
                if column:
                    if column is THREAD_STOP:
                        continue

                    with Timer("update {{table}}.{{column}}", param={"table": column.es_index, "column": column.es_column}, silent=not DEBUG):
                        if column.es_index in self.index_does_not_exist:
                            self.meta.columns.update({
                                "clear": ".",
                                "where": {"eq": {"es_index": column.es_index}}
                            })
                            continue
                        if column.jx_type in STRUCT or column.es_column.endswith("." + EXISTS_TYPE):
                            column.last_updated = Date.now()
                            continue
                        elif column.last_updated >= Date.now()-TOO_OLD:
                            continue
                        try:
                            self._update_cardinality(column)
                            (DEBUG and not column.es_index.startswith(TEST_TABLE_PREFIX)) and Log.note("updated {{column.name}}", column=column)
                        except Exception as e:
                            if '"status":404' in e:
                                self.meta.columns.update({
                                    "clear": ".",
                                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                                })
                            else:
                                Log.warning("problem getting cardinality for {{column.name}}", column=column, cause=e)
            except Exception as e:
                Log.warning("problem in cardinality monitor", cause=e)

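    # FALLBACK LOOP USED WHEN THE METADATA SCAN IS DISABLED: MARK QUEUED COLUMNS AS
    # RECENTLY UPDATED WITHOUT QUERYING ES, AND CLEAR ANY STALE STATISTICS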
    def not_monitor(self, please_stop):
        Log.alert("metadata scan has been disabled")
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            c = self.todo.pop()
            if c == THREAD_STOP:
                break

            if c.last_updated >= Date.now()-TOO_OLD:
                continue

            with Timer("Update {{col.es_index}}.{{col.es_column}}", param={"col": c}, silent=not DEBUG, too_long=0.05):
                self.meta.columns.update({
                    "set": {
                        "last_updated": Date.now()
                    },
                    "clear": [
                        "count",
                        "cardinality",
                        "multi",
                        "partitions",
                    ],
                    "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                })

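    # RETURN THE TableDesc RECORDS MATCHING name (meta.columns IS A SPECIAL CASE)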
    def get_table(self, name):
        if name == "meta.columns":
            return self.meta.columns

        with self.meta.tables.locker:
            return wrap([t for t in self.meta.tables.data if t.name == name])

    def get_snowflake(self, fact_table_name):
        return Snowflake(fact_table_name, self)

    def get_schema(self, name):
        if name == "meta.columns":
            return self.meta.columns.schema
        query_path = split_field(name)
        root, rest = query_path[0], join_field(query_path[1:])
        return self.get_snowflake(root).get_schema(rest)
Example No. 53
0
class StructuredLogger_usingElasticSearch(StructuredLogger):
    @override
    def __init__(
        self,
        host,
        index,
        port=9200,
        type="log",
        queue_size=1000,
        batch_size=100,
        kwargs=None,
    ):
        """
        settings ARE FOR THE ELASTICSEARCH INDEX
        """
        kwargs.timeout = Duration(coalesce(kwargs.timeout, "30second")).seconds
        kwargs.retry.times = coalesce(kwargs.retry.times, 3)
        kwargs.retry.sleep = Duration(coalesce(kwargs.retry.sleep, MINUTE)).seconds
        kwargs.host = Random.sample(listwrap(host), 1)[0]

        schema = json2value(value2json(SCHEMA), leaves=True)
        schema.mappings[type].properties["~N~"].type = "nested"
        self.es = Cluster(kwargs).get_or_create_index(
            schema=schema,
            limit_replicas=True,
            typed=True,
            kwargs=kwargs,
        )
        self.batch_size = batch_size
        self.es.add_alias(coalesce(kwargs.alias, kwargs.index))
        self.queue = Queue("debug logs to es", max=queue_size, silent=True)

        self.worker = Thread.run("add debug logs to es", self._insert_loop)

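    # QUEUE ONE LOG MESSAGE; THE TEMPLATE IS TRUNCATED TO 2000 CHARACTERS, AND A
    # 3 MINUTE TIMEOUT IS PASSED TO add() TO BOUND HOW LONG A FULL QUEUE CAN BLOCK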
    def write(self, template, params):
        try:
            params.template = strings.limit(params.template, 2000)
            params.format = None
            self.queue.add({"value": _deep_json_to_string(params, 3)}, timeout=3 * 60)
        except Exception as e:
            sys.stdout.write(text_type(Except.wrap(e)))
        return self

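    # WORKER LOOP: DRAIN THE QUEUE, SCRUB MESSAGES IN BATCHES OF batch_size, AND BULK
    # INSERT INTO ES; FAILURES BACK OFF FOR PAUSE_AFTER_BAD_INSERT, AND A "GIVEN UP"
    # WARNING IS LOGGED AFTER MAX_BAD_COUNT CONSECUTIVE FAILURES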
    def _insert_loop(self, please_stop=None):
        bad_count = 0
        while not please_stop:
            try:
                messages = wrap(self.queue.pop_all())
                if not messages:
                    Till(seconds=PAUSE_AFTER_GOOD_INSERT).wait()
                    continue

                for g, mm in jx.groupby(messages, size=self.batch_size):
                    scrubbed = []
                    for i, message in enumerate(mm):
                        if message is THREAD_STOP:
                            please_stop.go()
                            continue
                        try:
                            messages = flatten_causal_chain(message.value)
                            scrubbed.append(
                                {
                                    "value": [
                                        _deep_json_to_string(m, depth=3)
                                        for m in messages
                                    ]
                                }
                            )
                        except Exception as e:
                            Log.warning("Problem adding to scrubbed list", cause=e)

                    self.es.extend(scrubbed)
                    bad_count = 0
            except Exception as f:
                Log.warning("Problem inserting logs into ES", cause=f)
                bad_count += 1
                if bad_count > MAX_BAD_COUNT:
                    Log.warning(
                        "Given up trying to write debug logs to ES index {{index}}",
                        index=self.es.settings.index,
                    )
                Till(seconds=PAUSE_AFTER_BAD_INSERT).wait()

        self.es.flush()

        # CONTINUE TO DRAIN THIS QUEUE
        while not please_stop:
            try:
                Till(seconds=PAUSE_AFTER_GOOD_INSERT).wait()
                self.queue.pop_all()
            except Exception as e:
                Log.warning("Should not happen", cause=e)

    def stop(self):
        with suppress_exception:
            self.queue.add(THREAD_STOP)  # BE PATIENT, LET REST OF MESSAGE BE SENT

        with suppress_exception:
            self.queue.close()
        self.worker.join()
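
# MINIMAL USAGE SKETCH (host/index values are hypothetical; real deployments normally
# wire this logger in through mo-logs configuration rather than constructing it directly):
# logger = StructuredLogger_usingElasticSearch(host="http://localhost:9200", index="debug-logs")
# logger.write("problem in {{name}}", wrap({"name": "worker-1"}))
# logger.stop()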