Example #1
class StructuredLogger_usingQueue(StructuredLogger):
    def __init__(self, name=None):
        queue_name = "log messages to queue"
        if name:
            queue_name += " " + name
        self.queue = Queue(queue_name)

    def write(self, template, params):
        self.queue.add(expand_template(template, params))

    def stop(self):
        self.queue.close()

    def pop(self):
        lines = self.queue.pop()
        output = []
        for l in lines.split(CR):
            # REMOVE FIRST PART, THE TIMESTAMP
            # 0123456789012345678901234567890
            # 2019-01-06 19:13:49.937542 -
            prefix = re.match(DATE_PATTERN, l)
            if prefix:
                l = l[len(prefix.group(0)):]
            if not l.strip():
                continue
            if l.strip().startswith("File"):
                continue
            output.append(l)
        return CR.join(output).strip()
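This logger is a test helper: write() queues the expanded template, and pop() returns a normalized copy of the next queued item, dropping any leading timestamp and any "File ..." traceback lines. A minimal usage sketch, assuming Queue, expand_template, CR, and DATE_PATTERN come from the surrounding mo-logs/mo-threads test utilities (those imports are not shown in this example):

# Hypothetical sketch, not part of the original example.
log = StructuredLogger_usingQueue("test")
log.write("hello {{name}}", {"name": "world"})
assert log.pop() == "hello world"  # nothing to strip here: no timestamp or "File" lines
log.stop()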
Example #2
class StructuredLogger_usingQueue(StructuredLogger):
    def __init__(self, name=None):
        queue_name = "log messages to queue"
        if name:
            queue_name += " " + name
        self.queue = Queue(queue_name)

    def write(self, template, params):
        self.queue.add(expand_template(template, params))

    def stop(self):
        self.queue.close()

    def pop(self):
        lines = self.queue.pop()
        output = []
        for l in lines.split("\n"):
            if l[19:22] == " - ":
                l = l[22:]
            if l.strip().startswith("File"):
                continue
            output.append(l)
        return "\n".join(output).strip()
Example #3
class StructuredLogger_usingQueue(StructuredLogger):

    def __init__(self, name=None):
        queue_name = "log messages to queue"
        if name:
            queue_name += " "+name
        self.queue = Queue(queue_name)

    def write(self, template, params):
        self.queue.add(expand_template(template, params))

    def stop(self):
        self.queue.close()

    def pop(self):
        lines = self.queue.pop()
        output = []
        for l in lines.split("\n"):
            if l[19:22] == " - ":
                l = l[22:]
            if l.strip().startswith("File"):
                continue
            output.append(l)
        return "\n".join(output).strip()
Example #4
class ColumnList(Table, Container):
    """
    CENTRAL CONTAINER FOR ALL COLUMNS
    SYNCHRONIZED WITH ELASTICSEARCH
    OPTIMIZED FOR THE PARTICULAR ACCESS PATTERNS USED
    """
    def __init__(self, es_cluster):
        Table.__init__(self, META_COLUMNS_NAME)
        self.data = {}  # MAP FROM ES_INDEX TO (abs_column_name to COLUMNS)
        self.locker = Lock()
        self._schema = None
        self.dirty = False
        self.es_cluster = es_cluster
        self.es_index = None
        self.last_load = Null
        self.for_es_update = Queue(
            "update columns to es"
        )  # HOLD (action, column) PAIR, WHERE action in ['insert', 'update']
        self._db_load()
        self.delete_queue = Queue(
            "delete columns from es")  # CONTAINS (es_index, after) PAIRS
        Thread.run("update " + META_COLUMNS_NAME,
                   self._update_from_es,
                   parent_thread=MAIN_THREAD).release()
        Thread.run("delete columns",
                   self._delete_columns,
                   parent_thread=MAIN_THREAD).release()

    def _query(self, query):
        result = Data()
        curr = self.es_cluster.execute(query)
        result.meta.format = "table"
        result.header = [d[0] for d in curr.description
                         ] if curr.description else None
        result.data = curr.fetchall()
        return result

    def _db_create(self):
        schema = {
            "settings": {
                "index.number_of_shards": 1,
                "index.number_of_replicas": REPLICAS
            },
            "mappings": {
                META_COLUMNS_TYPE_NAME: {}
            },
        }

        self.es_index = self.es_cluster.create_index(id=ID,
                                                     index=META_COLUMNS_NAME,
                                                     schema=schema)
        self.es_index.add_alias(META_COLUMNS_NAME)

        for c in META_COLUMNS_DESC.columns:
            self._add(c)
            self.es_index.add({"value": c.__dict__()})

    def _db_load(self):
        self.last_load = Date.now()

        try:
            self.es_index = self.es_cluster.get_index(
                id=ID,
                index=META_COLUMNS_NAME,
                type=META_COLUMNS_TYPE_NAME,
                read_only=False)

            result = self.es_index.search({
                "query": {
                    "bool": {
                        "should": [
                            {
                                "bool": {
                                    "must_not": {
                                        "exists": {
                                            "field": "cardinality.~n~"
                                        }
                                    }
                                }
                            },
                            {  # ASSUME UNUSED COLUMNS DO NOT EXIST
                                "range": {
                                    "cardinality.~n~": {
                                        "gte": 0
                                    }
                                }
                            },
                        ]
                    }
                },
                "sort": ["es_index.~s~", "name.~s~", "es_column.~s~"],
                "size":
                10000,
            })

            with Timer("adding columns to structure"):
                for r in result.hits.hits._source:
                    col = doc_to_column(r)
                    if col:
                        self._add(col)

            Log.note("{{num}} columns loaded", num=result.hits.total)
            if not self.data.get(META_COLUMNS_NAME):
                Log.error("metadata missing from index!")

        except Exception as e:
            metadata = self.es_cluster.get_metadata(after=Date.now())
            if any(
                    index.startswith(META_COLUMNS_NAME)
                    for index in metadata.indices.keys()):
                Log.error("metadata already exists!", cause=e)

            Log.warning("no {{index}} exists, making one",
                        index=META_COLUMNS_NAME,
                        cause=e)
            self._db_create()

    def delete_from_es(self, es_index, after):
        """
        DELETE COLUMNS STORED IN THE ES INDEX
        :param es_index:
        :param after: ONLY DELETE RECORDS BEFORE THIS TIME
        :return:
        """
        self.delete_queue.add((es_index, after))

    def _delete_columns(self, please_stop):
        while not please_stop:
            result = self.delete_queue.pop(till=please_stop)
            if result == THREAD_STOP:
                break
            more_result = self.delete_queue.pop_all()
            results = [result] + more_result
            try:
                delete_result = self.es_index.delete_record({
                    "bool": {
                        "should": [{
                            "bool": {
                                "must": [{
                                    "term": {
                                        "es_index.~s~": es_index
                                    }
                                }, {
                                    "range": {
                                        "last_updated.~n~": {
                                            "lte": after.unix
                                        }
                                    }
                                }]
                            }
                        } for es_index, after in results]
                    }
                })

                if DEBUG:
                    query = {
                        "query": {
                            "terms": {
                                "es_index.~s~":
                                [es_index for es_index, after in results]
                            }
                        }
                    }
                    verify = self.es_index.search(query)
                    while verify.hits.total:
                        Log.note("wait for columns to be gone")
                        verify = self.es_index.search(query)

                    Log.note(
                        "Deleted {{delete_result}} columns from {{table}}",
                        table=[es_index for es_index, after in results],
                        delete_result=delete_result.deleted)
            except Exception as cause:
                Log.warning("Problem with delete of table", cause=cause)
            Till(seconds=1).wait()

    def _update_from_es(self, please_stop):
        try:
            last_extract = Date.now()
            while not please_stop:
                now = Date.now()
                try:
                    if (now - last_extract).seconds > COLUMN_EXTRACT_PERIOD:
                        result = self.es_index.search({
                            "query": {
                                "range": {
                                    "last_updated.~n~": {
                                        "gte": self.last_load
                                    }
                                }
                            },
                            "sort":
                            ["es_index.~s~", "name.~s~", "es_column.~s~"],
                            "from":
                            0,
                            "size":
                            10000,
                        })
                        last_extract = now

                        with self.locker:
                            for r in result.hits.hits._source:
                                c = doc_to_column(r)
                                if c:
                                    self._add(c)
                                    self.last_load = MAX(
                                        (self.last_load, c.last_updated))

                    while not please_stop:
                        updates = self.for_es_update.pop_all()
                        if not updates:
                            break

                        DEBUG and updates and Log.note(
                            "{{num}} columns to push to db", num=len(updates))
                        self.es_index.extend([{
                            "value": column.__dict__()
                        } for column in updates])
                except Exception as e:
                    Log.warning("problem updating database", cause=e)

                (Till(seconds=COLUMN_LOAD_PERIOD) | please_stop).wait()
        finally:
            Log.note("done")

    def find(self, es_index, abs_column_name=None):
        with self.locker:
            if es_index.startswith("meta."):
                self._update_meta()

            if not abs_column_name:
                return [
                    c for cs in self.data.get(es_index, {}).values()
                    for c in cs
                ]
            else:
                return self.data.get(es_index, {}).get(abs_column_name, [])

    def extend(self, columns):
        self.dirty = True
        with self.locker:
            for column in columns:
                self._add(column)

    def add(self, column):
        self.dirty = True
        with self.locker:
            canonical = self._add(column)
        if canonical == None:
            return column  # ALREADY ADDED
        self.for_es_update.add(canonical)
        return canonical

    def remove(self, column, after):
        if column.last_updated > after:
            return
        mark_as_deleted(column, after)
        with self.locker:
            canonical = self._add(column)
        if canonical:
            Log.error("Expecting canonical column to be removed")
        DEBUG and Log.note("delete {{col|quote}}, at {{timestamp}}",
                           col=column.es_column,
                           timestamp=column.last_updated)
        self.for_es_update.add(column)

    def remove_table(self, table_name):
        del self.data[table_name]

    def _add(self, column):
        """
        :param column: ANY COLUMN OBJECT
        :return:  None IF column IS canonical ALREADY (NET-ZERO EFFECT)
        """
        if not isinstance(column, Column):
            Log.warning("expecting a column not {{column|json}}",
                        column=column)
            return
        columns_for_table = self.data.setdefault(column.es_index, {})
        existing_columns = columns_for_table.setdefault(column.name, [])

        for canonical in existing_columns:
            if canonical is column:
                return None
            if canonical.es_type == column.es_type:
                if column.last_updated > canonical.last_updated:
                    for key in Column.__slots__:
                        old_value = canonical[key]
                        new_value = column[key]
                        if new_value == old_value:
                            pass  # NO NEED TO UPDATE WHEN NO CHANGE MADE (COMMON CASE)
                        else:
                            canonical[key] = new_value
                return canonical
        existing_columns.append(column)
        return column

    def _update_meta(self):
        if not self.dirty:
            return

        now = Date.now()
        for mc in META_COLUMNS_DESC.columns:
            count = 0
            values = set()
            objects = 0
            multi = 1
            for column in self._all_columns():
                value = column[mc.name]
                if value == None:
                    pass
                else:
                    count += 1
                    if is_list(value):
                        multi = max(multi, len(value))
                        try:
                            values |= set(value)
                        except Exception:
                            objects += len(value)
                    elif is_data(value):
                        objects += 1
                    else:
                        values.add(value)
            mc.count = count
            mc.cardinality = len(values) + objects
            mc.partitions = jx.sort(values)
            mc.multi = multi
            mc.last_updated = now

        META_COLUMNS_DESC.last_updated = now
        self.dirty = False

    def _all_columns(self):
        return [
            column for t, cs in self.data.items() for _, css in cs.items()
            for column in css
        ]

    def __iter__(self):
        with self.locker:
            self._update_meta()
            return iter(self._all_columns())

    def __len__(self):
        return self.data[META_COLUMNS_NAME]["es_index"].count

    def clear(self, es_index, es_column=None, after=None):
        if es_column:
            for c in self.data.get(es_index, {}).get(es_column, []):
                self.remove(c, after=after)
            return

        data = self.data
        with self.locker:
            cols = data.get(es_index)
            if not cols:
                return
            del data[es_index]

        for c in cols.values():
            for cc in c:
                mark_as_deleted(cc, after=after)

    def update(self, command):
        self.dirty = True
        try:
            command = to_data(command)
            DEBUG and Log.note(
                "Update {{timestamp}}: {{command|json}}",
                command=command,
                timestamp=Date(command["set"].last_updated),
            )
            eq = command.where.eq
            if eq.es_index:
                if len(eq) == 1:
                    if unwraplist(command.clear) == ".":
                        d = self.data
                        i = eq.es_index
                        with self.locker:
                            cols = d[i]
                            del d[i]

                        for c in cols:
                            self.remove(c)
                        return

                    # FASTEST
                    all_columns = self.data.get(eq.es_index, {}).values()
                    with self.locker:
                        columns = [c for cs in all_columns for c in cs]
                elif eq.es_column and len(eq) == 2:
                    # FASTER
                    all_columns = self.data.get(eq.es_index, {}).values()
                    with self.locker:
                        columns = [
                            c for cs in all_columns for c in cs
                            if c.es_column == eq.es_column
                        ]

                else:
                    # SLOWER
                    all_columns = self.data.get(eq.es_index, {}).values()
                    with self.locker:
                        columns = [
                            c for cs in all_columns for c in cs
                            if all(c[k] == v for k, v in
                                   eq.items())  # THIS LINE IS VERY SLOW
                        ]
            else:
                columns = list(self)
                columns = jx.filter(columns, command.where)

            with self.locker:
                for col in columns:
                    DEBUG and Log.note(
                        "update column {{table}}.{{column}}",
                        table=col.es_index,
                        column=col.es_column,
                    )
                    for k in command["clear"]:
                        if k == ".":
                            mark_as_deleted(col, Date.now())
                            self.for_es_update.add(col)
                            lst = self.data[col.es_index]
                            cols = lst[col.name]
                            cols.remove(col)
                            if len(cols) == 0:
                                del lst[col.name]
                                if len(lst) == 0:
                                    del self.data[col.es_index]
                            break
                        else:
                            col[k] = None
                    else:
                        # DID NOT DELETE COLUMN ("."), CONTINUE TO SET PROPERTIES
                        for k, v in command.set.items():
                            col[k] = v
                        self.for_es_update.add(col)

        except Exception as e:
            Log.error("should not happen", cause=e)

    def query(self, query):
        # NOT EXPECTED TO BE RUN
        Log.error("not")
        with self.locker:
            self._update_meta()
            if not self._schema:
                self._schema = Schema(".", [
                    c for cs in self.data[META_COLUMNS_NAME].values()
                    for c in cs
                ])
            snapshot = self._all_columns()

        from jx_python.containers.list import ListContainer

        query.frum = ListContainer(META_COLUMNS_NAME, snapshot, self._schema)
        return jx.run(query)

    def groupby(self, keys):
        with self.locker:
            self._update_meta()
            return jx.groupby(self.__iter__(), keys)

    def window(self, window):
        raise NotImplementedError()

    @property
    def schema(self):
        if not self._schema:
            with self.locker:
                self._update_meta()
                self._schema = Schema(".", [
                    c for cs in self.data[META_COLUMNS_NAME].values()
                    for c in cs
                ])
        return self._schema

    @property
    def namespace(self):
        return self

    def get_table(self, table_name):
        if table_name != META_COLUMNS_NAME:
            Log.error("this container has only the " + META_COLUMNS_NAME)
        return self

    def get_columns(self, table_name):
        if table_name != META_COLUMNS_NAME:
            Log.error("this container has only the " + META_COLUMNS_NAME)
        return self._all_columns()

    def denormalized(self):
        """
        THE INTERNAL STRUCTURE FOR THE COLUMN METADATA IS VERY DIFFERENT FROM
        THE DENORMALIZED PERSPECTIVE. THIS PROVIDES THAT PERSPECTIVE FOR QUERIES
        """
        with self.locker:
            self._update_meta()
            output = [
                {
                    "table": c.es_index,
                    "name": untype_path(c.name),
                    "cardinality": c.cardinality,
                    "es_column": c.es_column,
                    "es_index": c.es_index,
                    "last_updated": c.last_updated,
                    "count": c.count,
                    "nested_path": [unnest_path(n) for n in c.nested_path],
                    "es_type": c.es_type,
                    "type": c.jx_type,
                } for tname, css in self.data.items()
                for cname, cs in css.items() for c in cs
                if c.jx_type not in INTERNAL  # and c.es_column != "_id"
            ]

        from jx_python.containers.list import ListContainer

        return ListContainer(
            self.name,
            data=output,
            schema=BaseSchema(META_COLUMNS_NAME, SIMPLE_METADATA_COLUMNS),
        )
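ColumnList is a thread-safe catalog of Column objects: _add() merges each incoming column into a canonical in-memory set, while add(), remove(), and update() push the resulting changes onto for_es_update so the background _update_from_es thread can write them to Elasticsearch. A rough calling sketch, assuming you already have an elasticsearch.Cluster and a Column instance (neither is constructed in this example):

# Hypothetical sketch, not part of the original example.
columns = ColumnList(es_cluster)            # loads (or creates) the meta.columns index
canonical = columns.add(some_column)        # merged in memory, queued for the ES updater thread
found = columns.find(some_column.es_index)  # every known column of that index
listing = columns.denormalized()            # flattened, query-friendly view of the catalog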
Example #5
class Sqlite(DB):
    """
    Allows multi-threaded access
    Loads extension functions (like SQRT)
    """

    canonical = None

    def __init__(self, filename=None, db=None, upgrade=True):
        """
        :param db:  Optional, wrap a sqlite db in a thread
        :return: Multithread-safe database
        """
        if upgrade and not _upgraded:
            _upgrade()

        self.filename = filename
        self.db = db
        self.queue = Queue(
            "sql commands")  # HOLD (command, result, signal) PAIRS
        self.worker = Thread.run("sqlite db thread", self._worker)
        self.get_trace = DEBUG
        self.upgrade = upgrade

    def _enhancements(self):
        def regex(pattern, value):
            return 1 if re.match(pattern + "$", value) else 0

        con = self.db.create_function("regex", 2, regex)

        class Percentile(object):
            def __init__(self, percentile):
                self.percentile = percentile
                self.acc = []

            def step(self, value):
                self.acc.append(value)

            def finalize(self):
                return percentile(self.acc, self.percentile)

        con.create_aggregate("percentile", 2, Percentile)

    def execute(self, command):
        """
        COMMANDS WILL BE EXECUTED IN THE ORDER THEY ARE GIVEN
        BUT CAN INTERLEAVE WITH OTHER THREAD COMMANDS
        :param command: COMMAND FOR SQLITE
        :return: None
        """
        if DEBUG:  # EXECUTE IMMEDIATELY FOR BETTER STACK TRACE
            return self.query(command)

        if self.get_trace:
            trace = extract_stack(1)
        else:
            trace = None
        self.queue.add((command, None, None, trace))

    def query(self, command):
        """
        WILL BLOCK CALLING THREAD UNTIL THE command IS COMPLETED
        :param command: COMMAND FOR SQLITE
        :return: list OF RESULTS
        """
        if not self.worker:
            self.worker = Thread.run("sqlite db thread", self._worker)

        signal = Signal()
        result = Data()
        self.queue.add((command, result, signal, None))
        signal.wait()
        if result.exception:
            Log.error("Problem with Sqlite call", cause=result.exception)
        return result

    def _worker(self, please_stop):
        global _load_extension_warning_sent

        if DEBUG:
            Log.note("Sqlite version {{version}}",
                     version=sqlite3.sqlite_version)
        if Sqlite.canonical:
            self.db = Sqlite.canonical
        else:
            self.db = sqlite3.connect(coalesce(self.filename, ':memory:'))

            library_loc = File.new_instance(sys.modules[__name__].__file__,
                                            "../..")
            full_path = File.new_instance(
                library_loc, "vendor/sqlite/libsqlitefunctions.so").abspath
            try:
                trace = extract_stack(0)[0]
                if self.upgrade:
                    if os.name == 'nt':
                        file = File.new_instance(
                            trace["file"],
                            "../../vendor/sqlite/libsqlitefunctions.so")
                    else:
                        file = File.new_instance(
                            trace["file"],
                            "../../vendor/sqlite/libsqlitefunctions")

                    full_path = file.abspath
                    self.db.enable_load_extension(True)
                    self.db.execute("SELECT load_extension(" +
                                    self.quote_value(full_path) + ")")
            except Exception as e:
                if not _load_extension_warning_sent:
                    _load_extension_warning_sent = True
                    Log.warning(
                        "Could not load {{file}}}, doing without. (no SQRT for you!)",
                        file=full_path,
                        cause=e)

        try:
            while not please_stop:
                command, result, signal, trace = self.queue.pop(
                    till=please_stop)

                if DEBUG_INSERT and command.strip().lower().startswith(
                        "insert"):
                    Log.note("Running command\n{{command|indent}}",
                             command=command)
                if DEBUG and not command.strip().lower().startswith("insert"):
                    Log.note("Running command\n{{command|indent}}",
                             command=command)
                with Timer("Run command", debug=DEBUG):
                    if signal is not None:
                        try:
                            curr = self.db.execute(command)
                            self.db.commit()
                            result.meta.format = "table"
                            result.header = [d[0] for d in curr.description
                                             ] if curr.description else None
                            result.data = curr.fetchall()
                            if DEBUG and result.data:
                                text = convert.table2csv(list(result.data))
                                Log.note("Result:\n{{data}}", data=text)
                        except Exception as e:
                            e = Except.wrap(e)
                            result.exception = Except(
                                ERROR,
                                "Problem with\n{{command|indent}}",
                                command=command,
                                cause=e)
                        finally:
                            signal.go()
                    else:
                        try:
                            self.db.execute(command)
                            self.db.commit()
                        except Exception as e:
                            e = Except.wrap(e)
                            e.cause = Except(type=ERROR,
                                             template="Bad call to Sqlite",
                                             trace=trace)
                            Log.warning("Failure to execute", cause=e)

        except Exception as e:
            if not please_stop:
                Log.error("Problem with sql thread", e)
        finally:
            if DEBUG:
                Log.note("Database is closed")
            self.db.commit()
            self.db.close()

    def quote_column(self, column_name, table=None):
        return quote_column(column_name, table)

    def quote_value(self, value):
        return quote_value(value)
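Every SQL statement is funneled through one worker thread: execute() enqueues a (command, result, signal, trace) tuple and returns immediately, while query() enqueues the same tuple with a Signal and blocks until the worker fills in result. A rough sketch of that calling pattern (table and column names are made up):

# Hypothetical sketch, not part of the original example.
db = Sqlite()                                     # in-memory database; commands run on the worker thread
db.execute("CREATE TABLE points (x REAL)")        # fire-and-forget; runs in submission order
db.execute("INSERT INTO points VALUES (1), (4)")
result = db.query("SELECT SQRT(x) FROM points")   # blocks until the worker answers; SQRT needs the loaded extension
print(result.header, result.data)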
Example #6
File: sqlite.py Project: gmierz/coco-tools
class Sqlite(DB):
    """
    Allows multi-threaded access
    Loads extension functions (like SQRT)
    """

    canonical = None

    @override
    def __init__(self, filename=None, db=None, upgrade=True, load_functions=False, kwargs=None):
        """
        :param db:  Optional, wrap a sqlite db in a thread
        :return: Multithread-safe database
        """
        if upgrade and not _upgraded:
            _upgrade()

        self.settings = kwargs
        self.filename = File(filename).abspath
        self.db = db
        self.queue = Queue("sql commands")   # HOLD (command, result, signal) PAIRS
        self.worker = Thread.run("sqlite db thread", self._worker)
        self.get_trace = TRACE
        self.upgrade = upgrade
        self.closed = False
        if DEBUG:
            Log.note("Sqlite version {{version}}", version=self.query("select sqlite_version()").data[0][0])

    def _enhancements(self):
        def regex(pattern, value):
            return 1 if re.match(pattern+"$", value) else 0
        con = self.db.create_function("regex", 2, regex)

        class Percentile(object):
            def __init__(self, percentile):
                self.percentile=percentile
                self.acc=[]

            def step(self, value):
                self.acc.append(value)

            def finalize(self):
                return percentile(self.acc, self.percentile)

        con.create_aggregate("percentile", 2, Percentile)

    def execute(self, command):
        """
        COMMANDS WILL BE EXECUTED IN THE ORDER THEY ARE GIVEN
        BUT CAN INTERLEAVE WITH OTHER THREAD COMMANDS
        :param command: COMMAND FOR SQLITE
        :return: Signal THAT FIRES WHEN THE COMMAND IS DONE
        """
        if self.closed:
            Log.error("database is closed")
        if DEBUG_EXECUTE:  # EXECUTE IMMEDIATELY FOR BETTER STACK TRACE
            self.query(command)
            return DONE

        if self.get_trace:
            trace = extract_stack(1)
        else:
            trace = None

        is_done = Signal()
        self.queue.add((command, None, is_done, trace))
        return is_done

    def commit(self):
        """
        WILL BLOCK CALLING THREAD UNTIL ALL PREVIOUS execute() CALLS ARE COMPLETED
        :return:
        """
        if self.closed:
            Log.error("database is closed")
        signal = _allocate_lock()
        signal.acquire()
        self.queue.add((COMMIT, None, signal, None))
        signal.acquire()
        return

    def query(self, command):
        """
        WILL BLOCK CALLING THREAD UNTIL THE command IS COMPLETED
        :param command: COMMAND FOR SQLITE
        :return: list OF RESULTS
        """
        if self.closed:
            Log.error("database is closed")
        if not self.worker:
            self.worker = Thread.run("sqlite db thread", self._worker)

        signal = _allocate_lock()
        signal.acquire()
        result = Data()
        self.queue.add((command, result, signal, None))
        signal.acquire()
        if result.exception:
            Log.error("Problem with Sqlite call", cause=result.exception)
        return result

    def close(self):
        """
        OPTIONAL COMMIT-AND-CLOSE
        IF THIS IS NOT DONE, THE WORKER THREAD KEEPS RUNNING, AND THE THREAD
        THAT SPAWNED THIS INSTANCE MAY NOT EXIT CLEANLY
        :return:
        """
        self.closed = True
        signal = _allocate_lock()
        signal.acquire()
        self.queue.add((COMMIT, None, signal, None))
        signal.acquire()
        self.worker.please_stop.go()
        return

    def __enter__(self):
        return self  # ALLOW "with Sqlite(...) as db:" TO YIELD THE DATABASE

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def _worker(self, please_stop):
        global _load_extension_warning_sent

        try:
            if DEBUG:
                Log.note("Sqlite version {{version}}", version=sqlite3.sqlite_version)
            try:
                if Sqlite.canonical:
                    self.db = Sqlite.canonical
                else:
                    self.db = sqlite3.connect(coalesce(self.filename, ':memory:'), check_same_thread=False)
            except Exception as e:
                Log.error("could not open file {{filename}}", filename=self.filename)

            if self.settings.load_functions:
                self._load_functions()

            while not please_stop:
                quad = self.queue.pop(till=please_stop)
                if quad is None:
                    break
                command, result, signal, trace = quad

                show_timing = False
                if DEBUG_INSERT and command.strip().lower().startswith("insert"):
                    Log.note("Running command\n{{command|limit(100)|indent}}", command=command)
                    show_timing = True
                if DEBUG and not command.strip().lower().startswith("insert"):
                    Log.note("Running command\n{{command|limit(100)|indent}}", command=command)
                    show_timing = True
                with Timer("SQL Timing", silent=not show_timing):
                    if command is COMMIT:
                        self.db.commit()
                        signal.release()
                    elif signal is not None:
                        try:
                            curr = self.db.execute(command)
                            if result is not None:
                                result.meta.format = "table"
                                result.header = [d[0] for d in curr.description] if curr.description else None
                                result.data = curr.fetchall()
                                if DEBUG and result.data:
                                    text = convert.table2csv(list(result.data))
                                    Log.note("Result:\n{{data}}", data=text)
                        except Exception as e:
                            e = Except.wrap(e)
                            e.cause = Except(
                                type=ERROR,
                                template="Bad call to Sqlite",
                                trace=trace
                            )
                            if result is None:
                                Log.error("Problem with\n{{command|indent}}", command=command, cause=e)
                            else:
                                result.exception = Except(ERROR, "Problem with\n{{command|indent}}", command=command, cause=e)
                        finally:
                            if isinstance(signal, Signal):
                                signal.go()
                            else:
                                signal.release()
                    else:
                        try:
                            self.db.execute(command)
                        except Exception as e:
                            e = Except.wrap(e)
                            e.cause = Except(
                                type=ERROR,
                                template="Bad call to Sqlite",
                                trace=trace
                            )
                            Log.warning("Failure to execute", cause=e)

        except Exception as e:
            if not please_stop:
                Log.warning("Problem with sql thread", cause=e)
        finally:
            self.closed = True
            if DEBUG:
                Log.note("Database is closed")
            self.db.close()

    def _load_functions(self):
        global _load_extension_warning_sent
        library_loc = File.new_instance(sys.modules[__name__].__file__, "../..")
        full_path = File.new_instance(library_loc, "vendor/sqlite/libsqlitefunctions.so").abspath
        try:
            trace = extract_stack(0)[0]
            if self.upgrade:
                if os.name == 'nt':
                    file = File.new_instance(trace["file"], "../../vendor/sqlite/libsqlitefunctions.so")
                else:
                    file = File.new_instance(trace["file"], "../../vendor/sqlite/libsqlitefunctions")

                full_path = file.abspath
                self.db.enable_load_extension(True)
                self.db.execute(SQL_SELECT + "load_extension" + sql_iso(quote_value(full_path)))
        except Exception as e:
            if not _load_extension_warning_sent:
                _load_extension_warning_sent = True
                Log.warning("Could not load {{file}}, doing without. (no SQRT for you!)", file=full_path, cause=e)

    def create_new_functions(self):

        def regexp(pattern, item):
            reg = re.compile(pattern)
            return reg.search(item) is not None

        self.db.create_function("REGEXP", 2, regexp)
Example #7
class Sqlite(DB):
    """
    Allows multi-threaded access
    Loads extension functions (like SQRT)
    """

    canonical = None

    def __init__(self, filename=None, db=None, upgrade=True):
        """
        :param db:  Optional, wrap a sqlite db in a thread
        :return: Multithread-safe database
        """
        if upgrade and not _upgraded:
            _upgrade()

        self.filename = filename
        self.db = db
        self.queue = Queue("sql commands")   # HOLD (command, result, signal) PAIRS
        self.worker = Thread.run("sqlite db thread", self._worker)
        self.get_trace = DEBUG
        self.upgrade = upgrade

    def _enhancements(self):
        def regex(pattern, value):
            return 1 if re.match(pattern+"$", value) else 0
        con = self.db.create_function("regex", 2, regex)

        class Percentile(object):
            def __init__(self, percentile):
                self.percentile=percentile
                self.acc=[]

            def step(self, value):
                self.acc.append(value)

            def finalize(self):
                return percentile(self.acc, self.percentile)

        con.create_aggregate("percentile", 2, Percentile)

    def execute(self, command):
        """
        COMMANDS WILL BE EXECUTED IN THE ORDER THEY ARE GIVEN
        BUT CAN INTERLEAVE WITH OTHER THREAD COMMANDS
        :param command: COMMAND FOR SQLITE
        :return: None
        """
        if DEBUG:  # EXECUTE IMMEDIATELY FOR BETTER STACK TRACE
            return self.query(command)

        if self.get_trace:
            trace = extract_stack(1)
        else:
            trace = None
        self.queue.add((command, None, None, trace))

    def query(self, command):
        """
        WILL BLOCK CALLING THREAD UNTIL THE command IS COMPLETED
        :param command: COMMAND FOR SQLITE
        :return: list OF RESULTS
        """
        if not self.worker:
            self.worker = Thread.run("sqlite db thread", self._worker)

        signal = Signal()
        result = Data()
        self.queue.add((command, result, signal, None))
        signal.wait()
        if result.exception:
            Log.error("Problem with Sqlite call", cause=result.exception)
        return result

    def _worker(self, please_stop):
        global _load_extension_warning_sent

        if DEBUG:
            Log.note("Sqlite version {{version}}", version=sqlite3.sqlite_version)
        if Sqlite.canonical:
            self.db = Sqlite.canonical
        else:
            self.db = sqlite3.connect(coalesce(self.filename, ':memory:'))

            library_loc = File.new_instance(sys.modules[__name__].__file__, "../..")
            full_path = File.new_instance(library_loc, "vendor/sqlite/libsqlitefunctions.so").abspath
            try:
                trace = extract_stack(0)[0]
                if self.upgrade:
                    if os.name == 'nt':
                        file = File.new_instance(trace["file"], "../../vendor/sqlite/libsqlitefunctions.so")
                    else:
                        file = File.new_instance(trace["file"], "../../vendor/sqlite/libsqlitefunctions")

                    full_path = file.abspath
                    self.db.enable_load_extension(True)
                    self.db.execute("SELECT load_extension(" + self.quote_value(full_path) + ")")
            except Exception as e:
                if not _load_extension_warning_sent:
                    _load_extension_warning_sent = True
                    Log.warning("Could not load {{file}}}, doing without. (no SQRT for you!)", file=full_path, cause=e)

        try:
            while not please_stop:
                command, result, signal, trace = self.queue.pop(till=please_stop)

                if DEBUG_INSERT and command.strip().lower().startswith("insert"):
                    Log.note("Running command\n{{command|indent}}", command=command)
                if DEBUG and not command.strip().lower().startswith("insert"):
                    Log.note("Running command\n{{command|indent}}", command=command)
                with Timer("Run command", debug=DEBUG):
                    if signal is not None:
                        try:
                            curr = self.db.execute(command)
                            self.db.commit()
                            result.meta.format = "table"
                            result.header = [d[0] for d in curr.description] if curr.description else None
                            result.data = curr.fetchall()
                            if DEBUG and result.data:
                                text = convert.table2csv(list(result.data))
                                Log.note("Result:\n{{data}}", data=text)
                        except Exception as e:
                            e = Except.wrap(e)
                            result.exception = Except(ERROR, "Problem with\n{{command|indent}}", command=command, cause=e)
                        finally:
                            signal.go()
                    else:
                        try:
                            self.db.execute(command)
                            self.db.commit()
                        except Exception as e:
                            e = Except.wrap(e)
                            e.cause = Except(
                                type=ERROR,
                                template="Bad call to Sqlite",
                                trace=trace
                            )
                            Log.warning("Failure to execute", cause=e)

        except Exception as e:
            if not please_stop:
                Log.error("Problem with sql thread", e)
        finally:
            if DEBUG:
                Log.note("Database is closed")
            self.db.commit()
            self.db.close()

    def quote_column(self, column_name, table=None):
        return quote_column(column_name, table)

    def quote_value(self, value):
        return quote_value(value)
Example #8
File: meta.py Project: rv404674/TUID
class ElasticsearchMetadata(Namespace):
    """
    MANAGE SNOWFLAKE SCHEMAS FOR EACH OF THE ALIASES FOUND IN THE CLUSTER
    """

    @override
    def __new__(cls, kwargs, *args, **_kwargs):
        es_cluster = elasticsearch.Cluster(kwargs)
        output = known_clusters.get(id(es_cluster))
        if output is None:
            output = object.__new__(cls)
            known_clusters[id(es_cluster)] = output
        return output

    @override
    def __init__(self, host, index, sql_file='metadata.sqlite', alias=None, name=None, port=9200, kwargs=None):
        if hasattr(self, "settings"):
            return

        self.too_old = TOO_OLD
        self.settings = kwargs
        self.default_name = coalesce(name, alias, index)
        self.es_cluster = elasticsearch.Cluster(kwargs=kwargs)

        self.index_does_not_exist = set()
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        self.index_to_alias = Relation_usingList()

        self.es_metadata = Null
        self.metadata_last_updated = Date.now() - OLD_METADATA

        self.meta = Data()
        self.meta.columns = ColumnList()

        self.alias_to_query_paths = {
            "meta.columns": [['.']],
            "meta.tables": [['.']]
        }
        self.alias_last_updated = {
            "meta.columns": Date.now(),
            "meta.tables": Date.now()
        }
        table_columns = metadata_tables()
        self.meta.tables = ListContainer(
            "meta.tables",
            [
                # TableDesc("meta.columns", None, ".", Date.now()),
                # TableDesc("meta.tables", None, ".", Date.now())
            ],
            jx_base.Schema(".", table_columns)
        )
        self.meta.columns.extend(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("refresh metadata", self.not_monitor)
        return

    @property
    def namespace(self):
        return self.meta.columns.namespace

    @property
    def url(self):
        return self.es_cluster.url / self.default_name.replace(".", "/")

    def _reload_columns(self, table_desc):
        """
        :param table_desc: TABLE DESCRIPTOR; ITS name IS A REAL ALIAS (OR NAME OF AN INDEX THAT HAS NO ALIAS)
        :return:
        """
        # FIND ALL INDEXES OF ALIAS
        es_last_updated = self.es_cluster.metatdata_last_updated

        alias = table_desc.name
        canonical_index = self.es_cluster.get_best_matching_index(alias).index
        update_required = not (table_desc.timestamp < es_last_updated)
        metadata = self.es_cluster.get_metadata(force=update_required)

        indexes = self.index_to_alias.get_domain(alias)
        props = [
            (self.es_cluster.get_index(index=i, type=t, debug=DEBUG), t, m.properties)
            for i, d in metadata.indices.items()
            if i in indexes
            for t, m in [_get_best_type_from_mapping(d.mappings)]
        ]

        # CONFIRM ALL COLUMNS ARE SAME, FIX IF NOT
        dirty = False
        all_comparisons = list(jx.pairwise(props)) + list(jx.pairwise(jx.reverse(props)))
        # NOTICE THE SAME (index, type, properties) TRIPLE FROM ABOVE
        for (i1, t1, p1), (i2, t2, p2) in all_comparisons:
            diff = elasticsearch.diff_schema(p2, p1)
            if not self.settings.read_only:
                for d in diff:
                    dirty = True
                    i1.add_property(*d)
        meta = self.es_cluster.get_metadata(force=dirty).indices[canonical_index]

        data_type, mapping = _get_best_type_from_mapping(meta.mappings)
        mapping.properties["_id"] = {"type": "string", "index": "not_analyzed"}
        self._parse_properties(alias, mapping, meta)
        table_desc.timestamp = es_last_updated

    def _parse_properties(self, alias, mapping, meta):
        abs_columns = elasticsearch.parse_properties(alias, None, mapping.properties)
        if any(c.cardinality == 0 and c.names['.'] != '_id' for c in abs_columns):
            Log.warning(
                "Some columns are not stored {{names}}",
                names=[
                    ".".join((c.es_index, c.names['.']))
                    for c in abs_columns
                    if c.cardinality == 0
                ]
            )

        with Timer("upserting {{num}} columns", {"num": len(abs_columns)}, silent=not DEBUG):
            # LIST OF EVERY NESTED PATH
            query_paths = [[c.es_column] for c in abs_columns if c.es_type == "nested"]
            for a, b in itertools.product(query_paths, query_paths):
                aa = a[0]
                bb = b[0]
                if aa and bb.startswith(aa):
                    for i, b_prefix in enumerate(b):
                        if len(b_prefix) > len(aa):
                            continue
                        if aa == b_prefix:
                            break  # SPLIT ALREADY FOUND
                        b.insert(i, aa)
                        break
            for q in query_paths:
                q.append(SELF_PATH)
            query_paths.append(ROOT_PATH)
            self.alias_to_query_paths[alias] = query_paths
            for i in self.index_to_alias.get_domain(alias):
                self.alias_to_query_paths[i] = query_paths

            # ADD RELATIVE NAMES
            for abs_column in abs_columns:
                abs_column.last_updated = None
                abs_column.jx_type = jx_type(abs_column)
                for query_path in query_paths:
                    abs_column.names[query_path[0]] = relative_field(abs_column.names["."], query_path[0])
                self.todo.add(self.meta.columns.add(abs_column))
        pass

    def query(self, _query):
        return self.meta.columns.query(QueryOp(set_default(
            {
                "from": self.meta.columns,
                "sort": ["table", "name"]
            },
            _query.__data__()
        )))

    def _find_alias(self, name):
        if self.metadata_last_updated < self.es_cluster.metatdata_last_updated:
            for a in self.es_cluster.get_aliases():
                self.index_to_alias[a.index] = coalesce(a.alias, a.index)
                self.alias_last_updated.setdefault(a.alias, Date.MIN)
        if name in self.alias_last_updated:
            return name
        else:
            return self.index_to_alias[name]

    def get_columns(self, table_name, column_name=None, force=False):
        """
        RETURN METADATA COLUMNS
        """
        table_path = split_field(table_name)
        root_table_name = table_path[0]

        alias = self._find_alias(root_table_name)
        if not alias:
            self.es_cluster.get_metadata(force=True)
            alias = self._find_alias(root_table_name)
            if not alias:
                Log.error("{{table|quote}} does not exist", table=table_name)

        try:
            last_update = MAX([
                self.es_cluster.index_last_updated[i]
                for i in self.index_to_alias.get_domain(alias)
            ])

            table = self.get_table(alias)[0]
            # LAST TIME WE GOT INFO FOR THIS TABLE
            if not table:
                table = TableDesc(
                    name=alias,
                    url=None,
                    query_path=['.'],
                    timestamp=Date.MIN
                )
                with self.meta.tables.locker:
                    self.meta.tables.add(table)
                self._reload_columns(table)
            elif force or table.timestamp < last_update:
                self._reload_columns(table)

            columns = self.meta.columns.find(alias, column_name)
            columns = jx.sort(columns, "names.\\.")
            # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
            while len(self.todo) and not all(columns.get("last_updated")):
                if DEBUG:
                    if len(columns) > 10:
                        Log.note("waiting for {{num}} columns to update", num=len([c for c in columns if not c.last_updated]))
                    else:
                        Log.note("waiting for columns to update {{columns|json}}", columns=[c.es_index+"."+c.es_column for c in columns if not c.last_updated])
                Till(seconds=1).wait()
            return columns
        except Exception as e:
            Log.error("Not expected", cause=e)

        return []

    def _update_cardinality(self, column):
        """
        QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN
        """
        if column.es_index in self.index_does_not_exist:
            return

        if column.jx_type in STRUCT:
            Log.error("not supported")
        try:
            if column.es_index == "meta.columns":
                partitions = jx.sort([g[column.es_column] for g, _ in jx.groupby(self.meta.columns, column.es_column) if g[column.es_column] != None])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.columns),
                        "cardinality": len(partitions),
                        "multi": 1,
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            if column.es_index == "meta.tables":
                partitions = jx.sort([g[column.es_column] for g, _ in jx.groupby(self.meta.tables, column.es_column) if g[column.es_column] != None])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.tables),
                        "cardinality": len(partitions),
                        "multi": 1,
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return

            es_index = column.es_index.split(".")[0]

            is_text = [cc for cc in self.meta.columns if cc.es_column == column.es_column and cc.es_type == "text"]
            if is_text:
                # text IS A MULTIVALUE STRING THAT CAN ONLY BE FILTERED
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "aggs": {
                        "count": {"filter": {"match_all": {}}}
                    },
                    "size": 0
                })
                count = result.hits.total
                cardinality = max(1001, count)
                multi = 1001
            elif column.es_column == "_id":
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "query": {"match_all": {}},
                    "size": 0
                })
                count = cardinality = result.hits.total
                multi = 1
            elif column.es_type == BOOLEAN:
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "aggs": {
                        "count": _counting_query(column)
                    },
                    "size": 0
                })
                count = result.hits.total
                cardinality = 2
                multi = 1
            else:
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "aggs": {
                        "count": _counting_query(column),
                        "multi": {"max": {"script": "doc[" + quote(column.es_column) + "].values.size()"}}
                    },
                    "size": 0
                })
                agg_results = result.aggregations
                count = result.hits.total
                cardinality = coalesce(agg_results.count.value, agg_results.count._nested.value, agg_results.count.doc_count)
                multi = int(coalesce(agg_results.multi.value, 1))
                if cardinality == None:
                    Log.error("logic error")

            query = Data(size=0)

            if column.es_column == "_id":
                self.meta.columns.update({
                    "set": {
                        "count": cardinality,
                        "cardinality": cardinality,
                        "multi": 1,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            elif cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99):
                DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            elif column.es_type in elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
                DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            elif len(column.nested_path) != 1:
                query.aggs["_"] = {
                    "nested": {"path": column.nested_path[0]},
                    "aggs": {"_nested": {"terms": {"field": column.es_column}}}
                }
            elif cardinality == 0:
                query.aggs["_"] = {"terms": {"field": column.es_column}}
            else:
                query.aggs["_"] = {"terms": {"field": column.es_column, "size": cardinality}}

            result = self.es_cluster.post("/" + es_index + "/_search", data=query)

            aggs = result.aggregations._
            if aggs._nested:
                parts = jx.sort(aggs._nested.buckets.key)
            else:
                parts = jx.sort(aggs.buckets.key)

            self.meta.columns.update({
                "set": {
                    "count": count,
                    "cardinality": cardinality,
                    "multi": multi,
                    "partitions": parts,
                    "last_updated": Date.now()
                },
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
        except Exception as e:
            # CAN NOT IMPORT: THE TEST MODULES SET UP LOGGING
            # from tests.test_jx import TEST_TABLE
            e = Except.wrap(e)
            TEST_TABLE = "testdata"
            is_missing_index = any(w in e for w in ["IndexMissingException", "index_not_found_exception"])
            is_test_table = column.es_index.startswith((TEST_TABLE_PREFIX, TEST_TABLE))
            if is_missing_index and is_test_table:
                # WE EXPECT TEST TABLES TO DISAPPEAR
                self.meta.columns.update({
                    "clear": ".",
                    "where": {"eq": {"es_index": column.es_index}}
                })
                self.index_does_not_exist.add(column.es_index)
            else:
                self.meta.columns.update({
                    "set": {
                        "last_updated": Date.now()
                    },
                    "clear": [
                        "count",
                        "cardinality",
                        "multi",
                        "partitions",
                    ],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                Log.warning("Could not get {{col.es_index}}.{{col.es_column}} info", col=column, cause=e)

    def monitor(self, please_stop):
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            try:
                if not self.todo:
                    old_columns = [
                        c
                        for c in self.meta.columns
                        if (c.last_updated == None or c.last_updated < Date.now()-TOO_OLD) and c.jx_type not in STRUCT
                    ]
                    if old_columns:
                        DEBUG and Log.note(
                            "Old columns {{names|json}} last updated {{dates|json}}",
                            names=wrap(old_columns).es_column,
                            dates=[Date(t).format() for t in wrap(old_columns).last_updated]
                        )
                        self.todo.extend(old_columns)
                        # TEST CONSISTENCY
                        for c, d in product(list(self.todo.queue), list(self.todo.queue)):
                            if c.es_column == d.es_column and c.es_index == d.es_index and c != d:
                                Log.error("")
                    else:
                        DEBUG and Log.note("no more metatdata to update")

                column = self.todo.pop(Till(seconds=(10*MINUTE).seconds))
                if column:
                    if column is THREAD_STOP:
                        continue

                    with Timer("update {{table}}.{{column}}", param={"table": column.es_index, "column": column.es_column}, silent=not DEBUG):
                        if column.es_index in self.index_does_not_exist:
                            self.meta.columns.update({
                                "clear": ".",
                                "where": {"eq": {"es_index": column.es_index}}
                            })
                            continue
                        if column.jx_type in STRUCT or column.es_column.endswith("." + EXISTS_TYPE):
                            column.last_updated = Date.now()
                            continue
                        elif column.last_updated >= Date.now()-TOO_OLD:
                            continue
                        try:
                            self._update_cardinality(column)
                            (DEBUG and not column.es_index.startswith(TEST_TABLE_PREFIX)) and Log.note("updated {{column.name}}", column=column)
                        except Exception as e:
                            if '"status":404' in e:
                                self.meta.columns.update({
                                    "clear": ".",
                                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                                })
                            else:
                                Log.warning("problem getting cardinality for {{column.name}}", column=column, cause=e)
            except Exception as e:
                Log.warning("problem in cardinality monitor", cause=e)

    def not_monitor(self, please_stop):
        Log.alert("metadata scan has been disabled")
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            c = self.todo.pop()
            if c == THREAD_STOP:
                break

            if c.last_updated >= Date.now()-TOO_OLD:
                continue

            with Timer("Update {{col.es_index}}.{{col.es_column}}", param={"col": c}, silent=not DEBUG, too_long=0.05):
                self.meta.columns.update({
                    "set": {
                        "last_updated": Date.now()
                    },
                    "clear": [
                        "count",
                        "cardinality",
                        "multi",
                        "partitions",
                    ],
                    "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                })

    def get_table(self, name):
        if name == "meta.columns":
            return self.meta.columns

        with self.meta.tables.locker:
            return wrap([t for t in self.meta.tables.data if t.name == name])

    def get_snowflake(self, fact_table_name):
        return Snowflake(fact_table_name, self)

    def get_schema(self, name):
        if name == "meta.columns":
            return self.meta.columns.schema
        query_path = split_field(name)
        root, rest = query_path[0], join_field(query_path[1:])
        return self.get_snowflake(root).get_schema(rest)
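For columns that are neither _id nor boolean, _update_cardinality above asks Elasticsearch for three numbers in one _search call: the total document count, an estimate of the distinct-value count, and the maximum number of values any document holds in that field. The sketch below shows the rough shape of that request; the cluster address, index, field name, and use of the requests library are illustrative assumptions, and the _counting_query helper (not shown in this listing) is presumed to emit a cardinality aggregation.

# A standalone sketch of the aggregation request built above for an ordinary column.
# The cluster URL, index, and field are made up; the real code posts through
# self.es_cluster.post and builds the "count" agg with _counting_query (not shown).
import requests

ES = "http://localhost:9200"   # assumed cluster address
body = {
    "aggs": {
        "count": {"cardinality": {"field": "build.revision"}},
        "multi": {"max": {"script": "doc['build.revision'].values.size()"}},
    },
    "size": 0,
}
resp = requests.post(ES + "/my-index/_search", json=body).json()
count = resp["hits"]["total"]                         # total matching docs (pre-7.x response shape)
cardinality = resp["aggregations"]["count"]["value"]  # approximate distinct values
multi = int(resp["aggregations"]["multi"].get("value") or 1)
print(count, cardinality, multi)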
Example #9
class Celery(object):

    _fixups = None
    _pool = None

    def __init__(
        self,
        name,
        broker=None,
        include=None,
        **kwargs
    ):
        self.Task = MethodCaller

        self.name = name
        self.request_queue = Queue(name=name+" requests")
        self.response_queue = Queue(name=name+" responses")
        self.kwargs = kwargs
        self.include = include
        self.broker = broker
        self._config = {}
        self._tasks = {}
        self.on_init()
        self.response_worker = Thread.run("response worker", self._response_worker)
        self.responses = {}
        self.responses_lock = Lock()
        self.id_lock = Lock()
        self.next_id = 1
        self.worker = Worker(self.request_queue, self.response_queue, celery=self)

    def _response_worker(self, please_stop):
        while not please_stop:
            try:
                encoded_mail = self.response_queue.pop(till=please_stop)
                mail = json2value(encoded_mail)
                Log.note("got response for {{id}}", id=mail.request.id)
                with self.responses_lock:
                    try:
                        async_response = self.responses[mail.request.id]
                        async_response.mail = set_default(mail, async_response.mail)
                        if mail.status in states.READY_STATES:
                            async_response._ready.go()
                    except Exception as e:
                        Log.warning("not expected", cause=e)


            except Exception as e:
                Log.warning("not expected", cause=e)

    def __enter__(self):
        return self

    def __exit__(self, *exc_info):
        self.close()

    def close(self):
        self.response_worker.stop()

    def on_init(self):
        """Optional callback called at init."""
        pass

    def start(self, argv=None):
        pass

    @property
    def conf(self):
        return self._config

    def task(self, **opts):
        opts = wrap(opts)
        this = self

        def dec(fun):
            # GET THE PARAMETER NAMES FOR args
            arg_names = fun.__code__.co_varnames[:fun.__code__.co_argcount]
            if arg_names and arg_names[0] == 'self':
                arg_names = arg_names[1:]
            this._tasks[opts.name] = fun

            # "async" IS A RESERVED WORD IN PYTHON 3, SO THE INNER FUNCTION IS NAMED apply_async
            def apply_async(args, kwargs=None, *_args, **_kwargs):
                kwargs = set_default(kwargs, dict(zip(arg_names, args)))

                with self.id_lock:
                    id = self.next_id
                    self.next_id += 1

                mail = deepcopy(Data(
                    status=states.PENDING,
                    caller={
                        # "stack": extract_stack(1)
                    },
                    sender=set_default(_kwargs, opts),
                    message=kwargs,
                    request=set_default({"id": id})
                ))

                output = AsyncResult(id, mail=mail, app=self)
                with self.responses_lock:
                    self.responses[id] = output
                self.request_queue.add(value2json(mail))
                Log.note("Added {{id}} ({{name}}) to request queue\n{{request}}", id=id, name=opts.name, request=mail)
                return output

            def send_message(*args, **kwargs):
                return apply_async(args, kwargs)

            def revoke(terminate=True, signal='SIGINT'):
                pass

            setattr(send_message, "delay", send_message)
            setattr(send_message, "revoke", revoke)
            setattr(send_message, "apply_async", async)

            return send_message
        return dec

    def get_result(self, id):
        with self.responses_lock:
            response, self.responses[id] = self.responses[id], None
        response._ready.wait()
        if response.status in states.EXCEPTION_STATES:
            Log.error("bad response", cause=response.mail.result)
        else:
            return response.mail.result
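In the stand-in Celery above, the task decorator gives a function delay and apply_async attributes that serialize the call onto request_queue, and get_result blocks until the response worker marks the matching AsyncResult ready. A hypothetical usage sketch follows; it assumes the Worker created in __init__ actually drains request_queue and posts a response carrying the same request id, and the task name and body are made up.

# Hypothetical usage of the queue-backed Celery stand-in above; the task and its
# arguments are illustrative. delay() enqueues the call and returns an AsyncResult,
# and get_result() blocks until the worker posts a response for that request id.
app = Celery("demo")

@app.task(name="add")
def add(a, b):
    return a + b

pending = add.delay(1, 2)                          # serialize the call onto request_queue
result = app.get_result(pending.mail.request.id)   # wait for the response worker
print(result)
app.close()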
Example #10
class Sqlite(DB):
    """
    Allows multi-threaded access
    Loads extension functions (like SQRT)
    """
    @override
    def __init__(
        self,
        filename=None,
        db=None,
        get_trace=None,
        upgrade=False,
        load_functions=False,
        debug=False,
        kwargs=None,
    ):
        """
        :param filename:  FILE TO USE FOR DATABASE
        :param db: AN EXISTING sqlite3 DB YOU WOULD LIKE TO USE (INSTEAD OF USING filename)
        :param get_trace: GET THE STACK TRACE AND THREAD FOR EVERY DB COMMAND (GOOD FOR DEBUGGING)
        :param upgrade: REPLACE PYTHON sqlite3 DLL WITH MORE RECENT ONE, WITH MORE FUNCTIONS (NOT WORKING)
        :param load_functions: LOAD EXTENDED MATH FUNCTIONS (MAY REQUIRE upgrade)
        :param kwargs:
        """
        global _upgraded
        global _sqlite3

        self.settings = kwargs
        if not _upgraded:
            if upgrade:
                _upgrade()
            _upgraded = True
            import sqlite3 as _sqlite3

            _ = _sqlite3

        self.filename = File(filename).abspath if filename else None
        if known_databases.get(self.filename):
            Log.error(
                "Not allowed to create more than one Sqlite instance for {{file}}",
                file=self.filename,
            )
        self.debug = debug | DEBUG

        # SETUP DATABASE
        self.debug and Log.note("Sqlite version {{version}}",
                                version=_sqlite3.sqlite_version)
        try:
            if db == None:
                self.db = _sqlite3.connect(
                    database=coalesce(self.filename, ":memory:"),
                    check_same_thread=False,
                    isolation_level=None,
                )
            else:
                self.db = db
        except Exception as e:
            Log.error("could not open file {{filename}}",
                      filename=self.filename,
                      cause=e)
        self.upgrade = upgrade
        load_functions and self._load_functions()

        self.locker = Lock()
        self.available_transactions = []  # LIST OF ALL THE TRANSACTIONS BEING MANAGED
        self.queue = Queue("sql commands")  # HOLD (command, result, signal, stacktrace) TUPLES

        self.get_trace = coalesce(get_trace, TRACE)
        self.closed = False

        # WORKER VARIABLES
        self.transaction_stack = []  # THE TRANSACTION OBJECT WE HAVE PARTIALLY RUN
        self.last_command_item = None  # USE THIS TO HELP BLAME current_transaction FOR HANGING ON TOO LONG
        self.too_long = None
        self.delayed_queries = []
        self.delayed_transactions = []
        self.worker = Thread.run("sqlite db thread", self._worker)

        self.debug and Log.note(
            "Sqlite version {{version}}",
            version=self.query("select sqlite_version()").data[0][0],
        )

    def _enhancements(self):
        def regex(pattern, value):
            return 1 if re.match(pattern + "$", value) else 0

        self.db.create_function("regex", 2, regex)  # create_function RETURNS None, SO KEEP USING self.db

        class Percentile(object):
            def __init__(self, percentile):
                self.percentile = percentile
                self.acc = []

            def step(self, value):
                self.acc.append(value)

            def finalize(self):
                return percentile(self.acc, self.percentile)

        self.db.create_aggregate("percentile", 2, Percentile)

    def transaction(self):
        thread = Thread.current()
        parent = None
        with self.locker:
            for t in self.available_transactions:
                if t.thread is thread:
                    parent = t

        output = Transaction(self, parent=parent, thread=thread)
        self.available_transactions.append(output)
        return output

    def about(self, table_name):
        """
        :param table_name: TABLE OF INTEREST
        :return: SOME INFORMATION ABOUT THE TABLE
            (cid, name, dtype, notnull, dflt_value, pk) tuples
        """
        details = self.query("PRAGMA table_info" +
                             sql_iso(quote_column(table_name)))
        return details.data

    def query(self, command):
        """
        WILL BLOCK CALLING THREAD UNTIL THE command IS COMPLETED
        :param command: COMMAND FOR SQLITE
        :return: list OF RESULTS
        """
        if self.closed:
            Log.error("database is closed")

        signal = _allocate_lock()
        signal.acquire()
        result = Data()
        trace = get_stacktrace(1) if self.get_trace else None

        if self.get_trace:
            current_thread = Thread.current()
            with self.locker:
                for t in self.available_transactions:
                    if t.thread is current_thread:
                        Log.error(DOUBLE_TRANSACTION_ERROR)

        self.queue.add(CommandItem(command, result, signal, trace, None))
        signal.acquire()

        if result.exception:
            Log.error("Problem with Sqlite call", cause=result.exception)
        return result

    def close(self):
        """
        OPTIONAL COMMIT-AND-CLOSE
        IF THIS IS NOT DONE, THEN THE WORKER THREAD SPAWNED BY THIS INSTANCE WILL KEEP RUNNING
        :return:
        """
        self.closed = True
        signal = _allocate_lock()
        signal.acquire()
        self.queue.add(CommandItem(COMMIT, None, signal, None, None))
        signal.acquire()
        self.worker.please_stop.go()
        return

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def _load_functions(self):
        global _load_extension_warning_sent
        library_loc = File.new_instance(sys.modules[__name__].__file__,
                                        "../..")
        full_path = File.new_instance(
            library_loc, "vendor/sqlite/libsqlitefunctions.so").abspath
        try:
            trace = get_stacktrace(0)[0]
            if self.upgrade:
                if os.name == "nt":
                    file = File.new_instance(
                        trace["file"],
                        "../../vendor/sqlite/libsqlitefunctions.so")
                else:
                    file = File.new_instance(
                        trace["file"],
                        "../../vendor/sqlite/libsqlitefunctions")

                full_path = file.abspath
                self.db.enable_load_extension(True)
                self.db.execute(
                    text(SQL_SELECT + "load_extension" +
                         sql_iso(quote_value(full_path))))
        except Exception as e:
            if not _load_extension_warning_sent:
                _load_extension_warning_sent = True
                Log.warning(
                    "Could not load {{file}}, doing without. (no SQRT for you!)",
                    file=full_path,
                    cause=e,
                )

    def create_new_functions(self):
        def regexp(pattern, item):
            reg = re.compile(pattern)
            return reg.search(item) is not None

        self.db.create_function("REGEXP", 2, regexp)

    def show_transactions_blocked_warning(self):
        blocker = self.last_command_item
        blocked = (self.delayed_queries + self.delayed_transactions)[0]

        Log.warning(
            "Query on thread {{blocked_thread|json}} at\n"
            "{{blocked_trace|indent}}"
            "is blocked by {{blocker_thread|json}} at\n"
            "{{blocker_trace|indent}}"
            "this message brought to you by....",
            blocker_trace=format_trace(blocker.trace),
            blocked_trace=format_trace(blocked.trace),
            blocker_thread=blocker.transaction.thread.name
            if blocker.transaction is not None else None,
            blocked_thread=blocked.transaction.thread.name
            if blocked.transaction is not None else None,
        )

    def _close_transaction(self, command_item):
        query, result, signal, trace, transaction = command_item

        transaction.end_of_life = True
        with self.locker:
            self.available_transactions.remove(transaction)
            assert transaction not in self.available_transactions

            old_length = len(self.transaction_stack)
            old_trans = self.transaction_stack[-1]
            del self.transaction_stack[-1]

            assert old_length - 1 == len(self.transaction_stack)
            assert old_trans
            assert old_trans not in self.transaction_stack
        if not self.transaction_stack:
            # NESTED TRANSACTIONS NOT ALLOWED IN sqlite3
            self.debug and Log.note(FORMAT_COMMAND, command=query)
            self.db.execute(query)

        has_been_too_long = False
        with self.locker:
            if self.too_long is not None:
                self.too_long, too_long = None, self.too_long
                # WE ARE CHEATING HERE: WE REACH INTO THE Signal MEMBERS AND REMOVE WHAT WE ADDED TO THE INTERNAL job_queue
                with too_long.lock:
                    has_been_too_long = bool(too_long)
                    too_long.job_queue = None

            # PUT delayed BACK ON THE QUEUE, IN THE ORDER FOUND, BUT WITH QUERIES FIRST
            if self.delayed_transactions:
                for c in reversed(self.delayed_transactions):
                    self.queue.push(c)
                del self.delayed_transactions[:]
            if self.delayed_queries:
                for c in reversed(self.delayed_queries):
                    self.queue.push(c)
                del self.delayed_queries[:]
        if has_been_too_long:
            Log.note("Transaction blockage cleared")

    def _worker(self, please_stop):
        try:
            # MAIN EXECUTION LOOP
            while not please_stop:
                command_item = self.queue.pop(till=please_stop)
                if command_item is None:
                    break
                try:
                    self._process_command_item(command_item)
                except Exception as e:
                    Log.warning("worker can not execute command", cause=e)
        except Exception as e:
            e = Except.wrap(e)
            if not please_stop:
                Log.warning("Problem with sql", cause=e)
        finally:
            self.closed = True
            self.debug and Log.note("Database is closed")
            self.db.close()

    def _process_command_item(self, command_item):
        query, result, signal, trace, transaction = command_item

        with Timer("SQL Timing", verbose=self.debug):
            if transaction is None:
                # THIS IS A TRANSACTIONLESS QUERY, DELAY IT IF THERE IS A CURRENT TRANSACTION
                if self.transaction_stack:
                    with self.locker:
                        if self.too_long is None:
                            self.too_long = Till(
                                seconds=TOO_LONG_TO_HOLD_TRANSACTION)
                            self.too_long.then(
                                self.show_transactions_blocked_warning)
                        self.delayed_queries.append(command_item)
                    return
            elif self.transaction_stack and self.transaction_stack[-1] not in [
                    transaction,
                    transaction.parent,
            ]:
                # THIS TRANSACTION IS NOT THE CURRENT TRANSACTION, DELAY IT
                with self.locker:
                    if self.too_long is None:
                        self.too_long = Till(
                            seconds=TOO_LONG_TO_HOLD_TRANSACTION)
                        self.too_long.then(
                            self.show_transactions_blocked_warning)
                    self.delayed_transactions.append(command_item)
                return
            else:
                # ENSURE THE CURRENT TRANSACTION IS UP TO DATE FOR THIS query
                if not self.transaction_stack:
                    # sqlite3 ALLOWS ONLY ONE TRANSACTION AT A TIME
                    self.debug and Log.note(FORMAT_COMMAND, command=BEGIN)
                    self.db.execute(BEGIN)
                    self.transaction_stack.append(transaction)
                elif transaction is not self.transaction_stack[-1]:
                    self.transaction_stack.append(transaction)
                elif transaction.exception and query is not ROLLBACK:
                    result.exception = Except(
                        context=ERROR,
                        template="Not allowed to continue using a transaction that failed",
                        cause=transaction.exception,
                        trace=trace,
                    )
                    signal.release()
                    return

                try:
                    transaction.do_all()
                except Exception as e:
                    # DEAL WITH ERRORS IN QUEUED COMMANDS
                    # WE WILL UNWRAP THE OUTER EXCEPTION TO GET THE CAUSE
                    err = Except(
                        context=ERROR,
                        template="Bad call to Sqlite3 while " + FORMAT_COMMAND,
                        params={"command": e.params.current.command},
                        cause=e.cause,
                        trace=e.params.current.trace,
                    )
                    transaction.exception = result.exception = err

                    if query in [COMMIT, ROLLBACK]:
                        self._close_transaction(
                            CommandItem(ROLLBACK, result, signal, trace,
                                        transaction))

                    signal.release()
                    return

            try:
                # DEAL WITH END-OF-TRANSACTION MESSAGES
                if query in [COMMIT, ROLLBACK]:
                    self._close_transaction(command_item)
                    return

                # EXECUTE QUERY
                self.last_command_item = command_item
                self.debug and Log.note(FORMAT_COMMAND, command=query)
                curr = self.db.execute(text(query))
                result.meta.format = "table"
                result.header = ([d[0] for d in curr.description]
                                 if curr.description else None)
                result.data = curr.fetchall()
                if self.debug and result.data:
                    csv = convert.table2csv(list(result.data))
                    Log.note("Result:\n{{data|limit(100)|indent}}", data=csv)
            except Exception as e:
                e = Except.wrap(e)
                err = Except(
                    context=ERROR,
                    template="Bad call to Sqlite while " + FORMAT_COMMAND,
                    params={"command": query},
                    trace=trace,
                    cause=e,
                )
                result.exception = err
                if transaction:
                    transaction.exception = err
            finally:
                signal.release()
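Because the Sqlite wrapper above funnels every statement through one worker thread, callers only touch query, transaction, and close. A hypothetical usage sketch under that assumption follows; the file name and table are made up, and transaction objects are assumed to expose execute, as the Cache example further below relies on.

# Hypothetical usage of the single-writer Sqlite wrapper above; file and table names
# are made up. query() blocks until the worker thread has run the statement, and
# transaction() groups statements so they commit (or roll back) together.
db = Sqlite(filename="example.sqlite")
with db.transaction() as t:
    t.execute("CREATE TABLE IF NOT EXISTS kv (k TEXT PRIMARY KEY, v TEXT)")
    t.execute("INSERT OR REPLACE INTO kv (k, v) VALUES ('greeting', 'hello')")
result = db.query("SELECT k, v FROM kv")
print(result.header)   # column names
print(result.data)     # row tuples
db.close()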
Example #11
File: clogger.py  Project: rv404674/TUID
class Clogger:

    # Singleton of the look-ahead scanner Clogger
    SINGLE_CLOGGER = None
    def __new__(cls, *args, **kwargs):
        if cls.SINGLE_CLOGGER is None:
            cls.SINGLE_CLOGGER = object.__new__(cls)
        return cls.SINGLE_CLOGGER


    def __init__(self, conn=None, tuid_service=None, start_workers=True, new_table=False, kwargs=None):
        try:
            self.config = kwargs
            self.conn = conn if conn else sql.Sql(self.config.database.name)
            self.hg_cache = HgMozillaOrg(kwargs=self.config.hg_cache, use_cache=True) if self.config.hg_cache else Null

            self.tuid_service = tuid_service if tuid_service else tuid.service.TUIDService(
                kwargs=self.config.tuid, conn=self.conn, clogger=self
            )
            self.rev_locker = Lock()
            self.working_locker = Lock()

            if new_table:
                with self.conn.transaction() as t:
                    t.execute("DROP TABLE IF EXISTS csetLog")

            self.init_db()
            self.next_revnum = coalesce(self.conn.get_one("SELECT max(revnum)+1 FROM csetLog")[0], 1)
            self.csets_todo_backwards = Queue(name="Clogger.csets_todo_backwards")
            self.deletions_todo = Queue(name="Clogger.deletions_todo")
            self.maintenance_signal = Signal(name="Clogger.maintenance_signal")

            if 'tuid' in self.config:
                self.config = self.config.tuid

            self.disable_backfilling = False
            self.disable_tipfilling = False
            self.disable_deletion = False
            self.disable_maintenance = False

            self.backfill_thread = None
            self.tipfill_thread = None
            self.deletion_thread = None
            self.maintenance_thread = None

            # Make sure we are filled before allowing queries
            numrevs = self.conn.get_one("SELECT count(revnum) FROM csetLog")[0]
            if numrevs < MINIMUM_PERMANENT_CSETS:
                Log.note("Filling in csets to hold {{minim}} csets.", minim=MINIMUM_PERMANENT_CSETS)
                oldest_rev = 'tip'
                with self.conn.transaction() as t:
                    tmp = t.query("SELECT min(revnum), revision FROM csetLog").data[0][1]
                    if tmp:
                        oldest_rev = tmp
                self._fill_in_range(
                    MINIMUM_PERMANENT_CSETS - numrevs,
                    oldest_rev,
                    timestamp=False
                )

            Log.note(
                "Table is filled with atleast {{minim}} entries.",
                minim=MINIMUM_PERMANENT_CSETS
            )

            if start_workers:
                self.start_workers()
        except Exception as e:
            Log.warning("Cannot setup clogger: {{cause}}", cause=str(e))


    def start_backfilling(self):
        if not self.backfill_thread:
            self.backfill_thread = Thread.run('clogger-backfill', self.fill_backward_with_list)


    def start_tipfilling(self):
        if not self.tipfill_thread:
            self.tipfill_thread = Thread.run('clogger-tip', self.fill_forward_continuous)


    def start_maintenance(self):
        if not self.maintenance_thread:
            self.maintenance_thread = Thread.run('clogger-maintenance', self.csetLog_maintenance)


    def start_deleter(self):
        if not self.deletion_thread:
            self.deletion_thread = Thread.run('clogger-deleter', self.csetLog_deleter)


    def start_workers(self):
        self.start_tipfilling()
        self.start_backfilling()
        self.start_maintenance()
        self.start_deleter()
        Log.note("Started clogger workers.")


    def init_db(self):
        with self.conn.transaction() as t:
            t.execute('''
            CREATE TABLE IF NOT EXISTS csetLog (
                revnum         INTEGER PRIMARY KEY,
                revision       CHAR(12) NOT NULL,
                timestamp      INTEGER
            );''')


    def disable_all(self):
        self.disable_tipfilling = True
        self.disable_backfilling = True
        self.disable_maintenance = True
        self.disable_deletion = True


    def revnum(self):
        """
        :return: max revnum that was added
        """
        return coalesce(self.conn.get_one("SELECT max(revnum) as revnum FROM csetLog")[0], 0)


    def get_tip(self, transaction):
        return transaction.get_one(
            "SELECT max(revnum) as revnum, revision FROM csetLog"
        )


    def get_tail(self, transaction):
        return transaction.get_one(
            "SELECT min(revnum) as revnum, revision FROM csetLog"
        )


    def _get_clog(self, clog_url):
        try:
            Log.note("Searching through changelog {{url}}", url=clog_url)
            clog_obj = http.get_json(clog_url, retry=RETRY)
            return clog_obj
        except Exception as e:
            Log.error(
                "Unexpected error getting changset-log for {{url}}: {{error}}",
                url=clog_url,
                error=e
            )


    def _get_one_revision(self, transaction, cset_entry):
        # Returns a single revision if it exists
        _, rev, _ = cset_entry
        return transaction.get_one("SELECT revision FROM csetLog WHERE revision=?", (rev,))


    def _get_one_revnum(self, transaction, rev):
        # Returns a single revnum if it exists
        return transaction.get_one("SELECT revnum FROM csetLog WHERE revision=?", (rev,))


    def _get_revnum_range(self, transaction, revnum1, revnum2):
        # Returns an inclusive range of revision numbers
        high_num = max(revnum1, revnum2)
        low_num = min(revnum1, revnum2)

        return transaction.query(
            "SELECT revnum, revision FROM csetLog WHERE "
            "revnum >= " + str(low_num) + " AND revnum <= " + str(high_num)
        ).data


    def recompute_table_revnums(self):
        '''
        Recomputes the revnums for the csetLog table
        by creating a new table, and copying csetLog to
        it. The INTEGER PRIMARY KEY in the temp table auto increments
        as rows are added.

        IMPORTANT: Only call this after acquiring the
                   lock `self.working_locker`.
        :return:
        '''
        with self.conn.transaction() as t:
            t.execute('''
            CREATE TABLE temp (
                revnum         INTEGER PRIMARY KEY,
                revision       CHAR(12) NOT NULL,
                timestamp      INTEGER
            );''')

            t.execute(
                "INSERT INTO temp (revision, timestamp) "
                "SELECT revision, timestamp FROM csetlog ORDER BY revnum ASC"
            )

            t.execute("DROP TABLE csetLog;")
            t.execute("ALTER TABLE temp RENAME TO csetLog;")


    def check_for_maintenance(self):
        '''
        Returns True if the maintenance worker should be run now,
        and False otherwise.
        :return:
        '''
        numrevs = self.conn.get_one("SELECT count(revnum) FROM csetLog")[0]
        Log.note("Number of csets in csetLog table: {{num}}", num=numrevs)
        if numrevs >= SIGNAL_MAINTENANCE_CSETS:
            return True
        return False


    def add_cset_entries(self, ordered_rev_list, timestamp=False, number_forward=True):
        '''
        Adds a list of revisions to the table. Assumes ordered_rev_list is ordered
        based on how changesets are found in the changelog. Going forwards or backwards
        is dealt with by flipping the list.
        :param ordered_rev_list: Order given from changeset log searching.
        :param timestamp: If False, records are kept indefinitely,
                          but if holes exist: (delete, None, delete, None),
                          those deletes with Nones around them
                          will not be deleted.
        :param number_forward: If True, this function numbers the revision list
                               going forward from max(revnum); otherwise it goes backwards
                               from revnum, then adds X to all revnums and self.next_revnum,
                               where X is the length of ordered_rev_list.
        :return:
        '''
        with self.conn.transaction() as t:
            current_min = t.get_one("SELECT min(revnum) FROM csetlog")[0]
            current_max = t.get_one("SELECT max(revnum) FROM csetlog")[0]
            if not current_min or not current_max:
                current_min = 0
                current_max = 0

            direction = -1
            start = current_min - 1
            if number_forward:
                direction = 1
                start = current_max + 1
                ordered_rev_list = ordered_rev_list[::-1]

            insert_list = [
                (
                    start + direction * count,
                    rev,
                    int(time.time()) if timestamp else -1
                )
                for count, rev in enumerate(ordered_rev_list)
            ]

            # In case of overlapping requests
            fmt_insert_list = []
            for cset_entry in insert_list:
                tmp = self._get_one_revision(t, cset_entry)
                if not tmp:
                    fmt_insert_list.append(cset_entry)

            for _, tmp_insert_list in jx.groupby(fmt_insert_list, size=SQL_CSET_BATCH_SIZE):
                t.execute(
                    "INSERT INTO csetLog (revnum, revision, timestamp)" +
                    " VALUES " +
                    sql_list(
                        quote_set((revnum, revision, timestamp))
                        for revnum, revision, timestamp in tmp_insert_list
                    )
                )

            # Move the revision numbers forward if needed
            self.recompute_table_revnums()

        # Start a maintenance run if needed
        if self.check_for_maintenance():
            Log.note("Scheduling maintenance run on clogger.")
            self.maintenance_signal.go()


    def _fill_in_range(self, parent_cset, child_cset, timestamp=False, number_forward=True):
        '''
        Fills cset logs in a certain range. 'parent_cset' can be an int, in which case
        we fetch that many changesets instead. When parent_cset is an int we assume we
        are going backwards (number_forward is False), ignore the first changeset of the
        first log, and ignore the number_forward setting.
        Otherwise, we continue until we find the given 'parent_cset'.
        :param parent_cset:
        :param child_cset:
        :param timestamp:
        :param number_forward:
        :return:
        '''
        csets_to_add = []
        found_parent = False
        find_parent = False
        if type(parent_cset) != int:
            find_parent = True
        elif parent_cset >= MAX_BACKFILL_CLOGS * CHANGESETS_PER_CLOG:
            Log.warning(
                "Requested number of new changesets {{num}} is too high. "
                "Max number that can be requested is {{maxnum}}.",
                num=parent_cset,
                maxnum=MAX_BACKFILL_CLOGS * CHANGESETS_PER_CLOG
            )
            return None

        csets_found = 0
        clogs_seen = 0
        final_rev = child_cset
        while not found_parent and clogs_seen < MAX_BACKFILL_CLOGS:
            clog_url = str(HG_URL) + "/" + self.config.hg.branch + "/json-log/" + final_rev
            clog_obj = self._get_clog(clog_url)
            clog_csets_list = list(clog_obj['changesets'])
            for clog_cset in clog_csets_list[:-1]:
                if not number_forward and csets_found <= 0:
                    # Skip this entry, it already exists
                    csets_found += 1
                    continue

                nodes_cset = clog_cset['node'][:12]
                if find_parent:
                    if nodes_cset == parent_cset:
                        found_parent = True
                        if not number_forward:
                            # When going forward this entry is
                            # the given parent
                            csets_to_add.append(nodes_cset)
                        break
                else:
                    if csets_found + 1 > parent_cset:
                        found_parent = True
                        if not number_forward:
                            # When going forward this entry is
                            # the given parent (which is supposed
                            # to already exist)
                            csets_to_add.append(nodes_cset)
                        break
                    csets_found += 1
                csets_to_add.append(nodes_cset)
            if found_parent:
                break

            clogs_seen += 1
            final_rev = clog_csets_list[-1]['node'][:12]

        if found_parent:
            self.add_cset_entries(csets_to_add, timestamp=timestamp, number_forward=number_forward)
        else:
            Log.warning(
                "Couldn't find the end of the request for {{request}}. "
                "Max number that can be requested through _fill_in_range is {{maxnum}}.",
                request={
                    'parent_cset': parent_cset,
                    'child_cset':child_cset,
                    'number_forward': number_forward
                },
                maxnum=MAX_BACKFILL_CLOGS * CHANGESETS_PER_CLOG
            )
            return None
        return csets_to_add


    def initialize_to_range(self, old_rev, new_rev, delete_old=True):
        '''
        Used in service testing to get to very old
        changesets quickly.
        :param old_rev: The oldest revision to keep
        :param new_rev: The revision to start searching from
        :return:
        '''
        old_settings = [
            self.disable_tipfilling,
            self.disable_backfilling,
            self.disable_maintenance,
            self.disable_deletion
        ]
        self.disable_tipfilling = True
        self.disable_backfilling = True
        self.disable_maintenance = True
        self.disable_deletion = True

        old_rev = old_rev[:12]
        new_rev = new_rev[:12]

        with self.working_locker:
            if delete_old:
                with self.conn.transaction() as t:
                    t.execute("DELETE FROM csetLog")
            with self.conn.transaction() as t:
                t.execute(
                    "INSERT INTO csetLog (revision, timestamp) VALUES " +
                    quote_set((new_rev, -1))
                )
            self._fill_in_range(old_rev, new_rev, timestamp=True, number_forward=False)

        self.disable_tipfilling = old_settings[0]
        self.disable_backfilling = old_settings[1]
        self.disable_maintenance = old_settings[2]
        self.disable_deletion = old_settings[3]


    def fill_backward_with_list(self, please_stop=None):
        '''
        Expects requests of the tuple form: (parent_cset, timestamp)
        parent_cset can be an int X to go back by X changesets, or
        a string to search for going backwards in time. If timestamp
        is false, no timestamps will be added to the entries.
        :param please_stop:
        :return:
        '''
        while not please_stop:
            try:
                request = self.csets_todo_backwards.pop(till=please_stop)
                if please_stop:
                    break

                # If backfilling is disabled, all requests
                # are ignored.
                if self.disable_backfilling:
                    Till(till=CSET_BACKFILL_WAIT_TIME).wait()
                    continue

                if request:
                    parent_cset, timestamp = request
                else:
                    continue

                with self.working_locker:
                    with self.conn.transaction() as t:
                        parent_revnum = self._get_one_revnum(t, parent_cset)
                    if parent_revnum:
                        continue

                    with self.conn.transaction() as t:
                        _, oldest_revision = self.get_tail(t)

                    self._fill_in_range(
                        parent_cset,
                        oldest_revision,
                        timestamp=timestamp,
                        number_forward=False
                    )
                Log.note("Finished {{cset}}", cset=parent_cset)
            except Exception as e:
                Log.warning("Unknown error occurred during backfill: ", cause=e)


    def update_tip(self):
        '''
        Returns False if the tip is already at the newest, or True
        if an update has taken place.
        :return:
        '''
        clog_obj = self._get_clog(
            str(HG_URL) + "/" + self.config.hg.branch + "/json-log/tip"
        )

        # Get current tip in DB
        with self.conn.transaction() as t:
            _, newest_known_rev = self.get_tip(t)

        # If we are still at the newest, wait for CSET_TIP_WAIT_TIME seconds
        # before checking again.
        first_clog_entry = clog_obj['changesets'][0]['node'][:12]
        if newest_known_rev == first_clog_entry:
            return False

        csets_to_gather = None
        if not newest_known_rev:
            Log.note(
                "No revisions found in table, adding {{minim}} entries...",
                minim=MINIMUM_PERMANENT_CSETS
            )
            csets_to_gather = MINIMUM_PERMANENT_CSETS

        found_newest_known = False
        csets_to_add = []
        csets_found = 0
        clogs_seen = 0
        Log.note("Found new revisions. Updating csetLog tip to {{rev}}...", rev=first_clog_entry)
        while not found_newest_known and clogs_seen < MAX_TIPFILL_CLOGS:
            clog_csets_list = list(clog_obj['changesets'])
            for clog_cset in clog_csets_list[:-1]:
                nodes_cset = clog_cset['node'][:12]
                if not csets_to_gather:
                    if nodes_cset == newest_known_rev:
                        found_newest_known = True
                        break
                else:
                    if csets_found >= csets_to_gather:
                        found_newest_known = True
                        break
                csets_found += 1
                csets_to_add.append(nodes_cset)
            if not found_newest_known:
                # Get the next page
                clogs_seen += 1
                final_rev = clog_csets_list[-1]['node'][:12]
                clog_url = str(HG_URL) + "/" + self.config.hg.branch + "/json-log/" + final_rev
                clog_obj = self._get_clog(clog_url)

        if clogs_seen >= MAX_TIPFILL_CLOGS:
            Log.error(
                "Too many changesets, can't find last tip or the number is too high: {{rev}}. "
                "Maximum possible to request is {{maxnum}}",
                rev=coalesce(newest_known_rev, csets_to_gather),
                maxnum=MAX_TIPFILL_CLOGS * CHANGESETS_PER_CLOG
            )
            return False

        with self.working_locker:
            Log.note("Adding {{csets}}", csets=csets_to_add)
            self.add_cset_entries(csets_to_add, timestamp=False)
        return True


    def fill_forward_continuous(self, please_stop=None):
        while not please_stop:
            try:
                while not please_stop and not self.disable_tipfilling and self.update_tip():
                    pass
                (please_stop | Till(seconds=CSET_TIP_WAIT_TIME)).wait()
            except Exception as e:
                Log.warning("Unknown error occurred during tip filling:", cause=e)


    def csetLog_maintenance(self, please_stop=None):
        '''
        Handles deleting old csetLog entries, and timestamps revisions
        once they fall outside the permanent-storage window so they can
        be deleted later.
        :param please_stop:
        :return:
        '''
        while not please_stop:
            try:
                # Wait until something signals the maintenance cycle
                # to begin (or end).
                (self.maintenance_signal | please_stop).wait()

                if please_stop:
                    break
                if self.disable_maintenance:
                    continue

                Log.warning(
                    "Starting clog maintenance. Since this doesn't start often, "
                    "we need to explicitly see when it's started with this warning."
                )

                # Reset signal so we don't request
                # maintenance infinitely.
                with self.maintenance_signal.lock:
                    self.maintenance_signal._go = False

                with self.working_locker:
                    all_data = None
                    with self.conn.transaction() as t:
                        all_data = sorted(
                            t.get("SELECT revnum, revision, timestamp FROM csetLog"),
                            key=lambda x: int(x[0])
                        )

                    # Restore maximum permanents (if overflowing)
                    new_data = []
                    modified = False
                    for count, (revnum, revision, timestamp) in enumerate(all_data[::-1]):
                        if count < MINIMUM_PERMANENT_CSETS:
                            if timestamp != -1:
                                modified = True
                                new_data.append((revnum, revision, -1))
                            else:
                                new_data.append((revnum, revision, timestamp))
                        elif type(timestamp) != int or timestamp == -1:
                            modified = True
                            new_data.append((revnum, revision, int(time.time())))
                        else:
                            new_data.append((revnum, revision, timestamp))

                    # Delete annotations at revisions with timestamps
                    # that are too old. The csetLog entries will have
                    # their timestamps reset here.
                    new_data1 = []
                    annrevs_to_del = []
                    current_time = time.time()
                    for count, (revnum, revision, timestamp) in enumerate(new_data[::-1]):
                        new_timestamp = timestamp
                        if timestamp != -1:
                            if current_time >= timestamp + TIME_TO_KEEP_ANNOTATIONS.seconds:
                                modified = True
                                new_timestamp = current_time
                                annrevs_to_del.append(revision)
                        new_data1.append((revnum, revision, new_timestamp))

                    if len(annrevs_to_del) > 0:
                        # Delete any latestFileMod and annotation entries
                        # that are too old.
                        Log.note(
                            "Deleting annotations and latestFileMod for revisions for being "
                            "older than {{oldest}}: {{revisions}}",
                            oldest=TIME_TO_KEEP_ANNOTATIONS,
                            revisions=annrevs_to_del
                        )
                        with self.conn.transaction() as t:
                            t.execute(
                                "DELETE FROM latestFileMod WHERE revision IN " +
                                quote_set(annrevs_to_del)
                            )
                            t.execute(
                                "DELETE FROM annotations WHERE revision IN " +
                                quote_set(annrevs_to_del)
                            )

                    # Delete any overflowing entries
                    new_data2 = new_data1
                    reved_all_data = all_data[::-1]
                    deleted_data = reved_all_data[MAXIMUM_NONPERMANENT_CSETS:]
                    delete_overflowing_revstart = None
                    if len(deleted_data) > 0:
                        _, delete_overflowing_revstart, _ = deleted_data[0]
                        new_data2 = set(all_data) - set(deleted_data)

                        # Update old frontiers if requested, otherwise
                        # they will all get deleted by the csetLog_deleter
                        # worker
                        if UPDATE_VERY_OLD_FRONTIERS:
                            _, max_revision, _ = all_data[-1]
                            for _, revision, _ in deleted_data:
                                with self.conn.transaction() as t:
                                    old_files = t.get(
                                        "SELECT file FROM latestFileMod WHERE revision=?",
                                        (revision,)
                                    )
                                if old_files is None or len(old_files) <= 0:
                                    continue

                                self.tuid_service.get_tuids_from_files(
                                    old_files,
                                    max_revision,
                                    going_forward=True,
                                )

                                still_exist = True
                                while still_exist and not please_stop:
                                    Till(seconds=TUID_EXISTENCE_WAIT_TIME).wait()
                                    with self.conn.transaction() as t:
                                        old_files = t.get(
                                            "SELECT file FROM latestFileMod WHERE revision=?",
                                            (revision,)
                                        )
                                    if old_files is None or len(old_files) <= 0:
                                        still_exist = False

                    # Update table and schedule a deletion
                    if modified:
                        with self.conn.transaction() as t:
                            insert_into_db_chunked(
                                t,
                                new_data2,
                                "INSERT OR REPLACE INTO csetLog (revnum, revision, timestamp) VALUES "
                            )
                    if not deleted_data:
                        continue

                    Log.note("Scheduling {{num_csets}} for deletion", num_csets=len(deleted_data))
                    self.deletions_todo.add(delete_overflowing_revstart)
            except Exception as e:
                Log.warning("Unexpected error occured while maintaining csetLog, continuing to try: ", cause=e)
        return


    def csetLog_deleter(self, please_stop=None):
        '''
        Deletes changesets from the csetLog table
        and also changesets from the annotation table
        that have revisions matching the given changesets.
        Accepts lists of csets from self.deletions_todo.
        :param please_stop:
        :return:
        '''
        while not please_stop:
            try:
                request = self.deletions_todo.pop(till=please_stop)
                if please_stop:
                    break

                # If deletion is disabled, ignore the current
                # request - it will need to be re-requested.
                if self.disable_deletion:
                    Till(till=CSET_DELETION_WAIT_TIME).wait()
                    continue

                with self.working_locker:
                    first_cset = request

                    # Since we are deleting and moving stuff around in the
                    # TUID tables, we need everything to be contained in
                    # one transaction with no interruptions.
                    with self.conn.transaction() as t:
                        revnum = self._get_one_revnum(t, first_cset)[0]
                        csets_to_del = t.get(
                            "SELECT revnum, revision FROM csetLog WHERE revnum <= ?", (revnum,)
                        )
                        csets_to_del = [cset for _, cset in csets_to_del]
                        existing_frontiers = t.query(
                            "SELECT revision FROM latestFileMod WHERE revision IN " +
                            quote_set(csets_to_del)
                        ).data

                        existing_frontiers = [row[0] for row in existing_frontiers]
                        Log.note(
                            "Deleting all annotations and changeset log entries with revisions in the list: {{csets}}",
                            csets=csets_to_del
                        )

                        if len(existing_frontiers) > 0:
                            # This handles files which no longer exist anymore in
                            # the main branch.
                            Log.note(
                                "Deleting existing frontiers for revisions: {{revisions}}",
                                revisions=existing_frontiers
                            )
                            t.execute(
                                "DELETE FROM latestFileMod WHERE revision IN " +
                                quote_set(existing_frontiers)
                            )

                        Log.note("Deleting annotations...")
                        t.execute(
                            "DELETE FROM annotations WHERE revision IN " +
                            quote_set(csets_to_del)
                        )

                        Log.note(
                            "Deleting {{num_entries}} csetLog entries...",
                            num_entries=len(csets_to_del)
                        )
                        t.execute(
                            "DELETE FROM csetLog WHERE revision IN " +
                            quote_set(csets_to_del)
                        )

                    # Recalculate the revnums
                    self.recompute_table_revnums()
            except Exception as e:
                Log.warning("Unexpected error occured while deleting from csetLog:", cause=e)
                Till(seconds=CSET_DELETION_WAIT_TIME).wait()
        return


    def get_old_cset_revnum(self, revision):
        self.csets_todo_backwards.add((revision, True))

        revnum = None
        timeout = Till(seconds=BACKFILL_REVNUM_TIMEOUT)
        while not timeout:
            with self.conn.transaction() as t:
                revnum = self._get_one_revnum(t, revision)

            if revnum and revnum[0] >= 0:
                break
            elif revnum and revnum[0] < 0:
                Log.note("Waiting for table to recompute...")
            else:
                Log.note("Waiting for backfill to complete...")
            Till(seconds=CSET_BACKFILL_WAIT_TIME).wait()

        if timeout:
            Log.error(
                "Cannot find revision {{rev}} after waiting {{timeout}} seconds",
                rev=revision,
                timeout=BACKFILL_REVNUM_TIMEOUT
            )
        return revnum


    def get_revnnums_from_range(self, revision1, revision2):
        with self.conn.transaction() as t:
            revnum1 = self._get_one_revnum(t, revision1)
            revnum2 = self._get_one_revnum(t, revision2)
        if not revnum1 or not revnum2:
            did_an_update = self.update_tip()
            if did_an_update:
                with self.conn.transaction() as t:
                    revnum1 = self._get_one_revnum(t, revision1)
                    revnum2 = self._get_one_revnum(t, revision2)

            if not revnum1:
                revnum1 = self.get_old_cset_revnum(revision1)
                # Refresh the second entry
                with self.conn.transaction() as t:
                    revnum2 = self._get_one_revnum(t, revision2)

            if not revnum2:
                revnum2 = self.get_old_cset_revnum(revision2)

                # The first revnum might change also
                with self.conn.transaction() as t:
                    revnum1 = self._get_one_revnum(t, revision1)

        with self.conn.transaction() as t:
            result = self._get_revnum_range(t, revnum1[0], revnum2[0])
        return sorted(
            result,
            key=lambda x: int(x[0])
        )
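get_old_cset_revnum above polls the database until a revnum appears or a Till timeout fires. A simplified, self-contained sketch of that poll-until-deadline shape using only the standard library (the names and numbers are illustrative, not from the original code):

import time

def poll_until(check, timeout_seconds=30.0, interval_seconds=1.0):
    # Call `check()` repeatedly until it returns a truthy value or the deadline passes.
    deadline = time.monotonic() + timeout_seconds
    while time.monotonic() < deadline:
        result = check()
        if result:
            return result
        time.sleep(interval_seconds)
    raise TimeoutError("no result before the deadline")

# Example: a check that succeeds immediately.
print(poll_until(lambda: 42, timeout_seconds=5.0))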
Example #12
0
File: cache.py Project: rv404674/TUID
class Cache(object):
    """
    For Caching hg.mo requests
    """

    @override
    def __init__(self, rate=None, amortization_period=None, source=None, database=None, kwargs=None):
        self.amortization_period = coalesce(amortization_period, AMORTIZATION_PERIOD)
        self.rate = coalesce(rate, HG_REQUEST_PER_SECOND)
        self.cache_locker = Lock()
        self.cache = {}  # MAP FROM url TO (ready, headers, response, timestamp) PAIR
        self.no_cache = {}  # VERY SHORT TERM CACHE
        self.workers = []
        self.todo = Queue(APP_NAME+" todo")
        self.requests = Queue(APP_NAME + " requests", max=int(self.rate * self.amortization_period.seconds))
        self.url = URL(source.url)
        self.db = Sqlite(database)
        self.inbound_rate = RateLogger("Inbound")
        self.outbound_rate = RateLogger("hg.mo")

        if not self.db.query("SELECT name FROM sqlite_master WHERE type='table'").data:
            with self.db.transaction() as t:
                t.execute(
                    "CREATE TABLE cache ("
                    "   path TEXT PRIMARY KEY, "
                    "   headers TEXT, "
                    "   response TEXT, "
                    "   timestamp REAL "
                    ")"
                )

        self.threads = [
            Thread.run(APP_NAME+" worker" + text_type(i), self._worker)
            for i in range(CONCURRENCY)
        ]
        self.limiter = Thread.run(APP_NAME+" limiter", self._rate_limiter)
        self.cleaner = Thread.run(APP_NAME+" cleaner", self._cache_cleaner)

    def _rate_limiter(self, please_stop):
        try:
            max_requests = self.requests.max
            recent_requests = []

            while not please_stop:
                now = Date.now()
                too_old = now - self.amortization_period

                recent_requests = [t for t in recent_requests if t > too_old]

                num_recent = len(recent_requests)
                if num_recent >= max_requests:
                    space_free_at = recent_requests[0] + self.amortization_period
                    (please_stop | Till(till=space_free_at.unix)).wait()
                    continue
                for _ in xrange(num_recent, max_requests):
                    request = self.todo.pop()
                    now = Date.now()
                    recent_requests.append(now)
                    self.requests.add(request)
        except Exception as e:
            Log.warning("failure", cause=e)

    def _cache_cleaner(self, please_stop):
        while not please_stop:
            now = Date.now()
            too_old = now-CACHE_RETENTION

            remove = set()
            with self.cache_locker:
                for path, (ready, headers, response, timestamp) in self.cache.items():
                    if timestamp < too_old:
                        remove.add(path)
                for r in remove:
                    del self.cache[r]
            (please_stop | Till(seconds=CACHE_RETENTION.seconds / 2)).wait()

    def please_cache(self, path):
        """
        :return: True if `path` should be cached, False otherwise
        """
        if path.endswith("/tip"):
            return False
        if any(k in path for k in ["/json-annotate/", "/json-info/", "/json-log/", "/json-rev/", "/rev/", "/raw-rev/", "/raw-file/", "/json-pushes", "/pushloghtml", "/file/"]):
            return True

        return False

    def request(self, method, path, headers):
        now = Date.now()
        self.inbound_rate.add(now)
        ready = Signal(path)

        # TEST CACHE
        with self.cache_locker:
            pair = self.cache.get(path)
            if pair is None:
                self.cache[path] = (ready, None, None, now)


        if pair is not None:
            # REQUEST IS IN THE QUEUE ALREADY, WAIT
            ready, headers, response, then = pair
            if response is None:
                ready.wait()
                with self.cache_locker:
                    ready, headers, response, timestamp = self.cache.get(path)
            with self.db.transaction() as t:
                t.execute("UPDATE cache SET timestamp=" + quote_value(now) + " WHERE path=" + quote_value(path) + " AND timestamp<" + quote_value(now))
            return Response(
                response,
                status=200,
                headers=json.loads(headers)
            )

        # TEST DB
        db_response = self.db.query("SELECT headers, response FROM cache WHERE path=" + quote_value(path)).data
        if db_response:
            headers, response = db_response[0]
            with self.db.transaction() as t:
                t.execute("UPDATE cache SET timestamp=" + quote_value(now) + " WHERE path=" + quote_value(path) + " AND timestamp<" + quote_value(now))
            with self.cache_locker:
                self.cache[path] = (ready, headers, response.encode('latin1'), now)
            ready.go()

            return Response(
                response,
                status=200,
                headers=json.loads(headers)
            )

        # MAKE A NETWORK REQUEST
        self.todo.add((ready, method, path, headers, now))
        ready.wait()
        with self.cache_locker:
            ready, headers, response, timestamp = self.cache[path]
        return Response(
            response,
            status=200,
            headers=json.loads(headers)
        )

    def _worker(self, please_stop):
        while not please_stop:
            pair = self.requests.pop(till=please_stop)
            if please_stop:
                break
            ready, method, path, req_headers, timestamp = pair

            try:
                url = self.url / path
                self.outbound_rate.add(Date.now())
                response = http.request(method, url, req_headers)

                del response.headers['transfer-encoding']
                resp_headers = value2json(response.headers)
                resp_content = response.raw.read()

                please_cache = self.please_cache(path)
                if please_cache:
                    with self.db.transaction() as t:
                        t.execute("INSERT INTO cache (path, headers, response, timestamp) VALUES" + quote_list((path, resp_headers, resp_content.decode('latin1'), timestamp)))
                with self.cache_locker:
                    self.cache[path] = (ready, resp_headers, resp_content, timestamp)
            except Exception as e:
                Log.warning("problem with request to {{path}}", path=path, cause=e)
                with self.cache_locker:
                    ready, headers, response, timestamp = self.cache[path]
                    del self.cache[path]
            finally:
                ready.go()
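The `_rate_limiter` worker above keeps a sliding window of recent request timestamps and only releases queued work while the window has room. A simplified, self-contained sketch of the same sliding-window idea (the function name and numbers are illustrative, not part of the Cache class):

import time
from collections import deque

def allow_request(recent, max_requests, period_seconds):
    # Drop timestamps that have fallen out of the sliding window.
    now = time.monotonic()
    while recent and recent[0] <= now - period_seconds:
        recent.popleft()
    if len(recent) < max_requests:
        recent.append(now)
        return True
    return False

recent = deque()
for i in range(5):
    # With max_requests=3 per 1 second, the first three calls pass.
    print(allow_request(recent, max_requests=3, period_seconds=1.0))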
Example #13
0
class Clogger:

    # Singleton of the look-ahead scanner Clogger
    SINGLE_CLOGGER = None

    def __new__(cls, *args, **kwargs):
        if cls.SINGLE_CLOGGER is None:
            cls.SINGLE_CLOGGER = object.__new__(cls)
        return cls.SINGLE_CLOGGER

    def __init__(self,
                 conn=None,
                 tuid_service=None,
                 start_workers=True,
                 new_table=False,
                 kwargs=None):
        try:
            self.config = kwargs
            self.conn = conn if conn else sql.Sql(self.config.database.name)
            self.hg_cache = HgMozillaOrg(
                kwargs=self.config.hg_cache,
                use_cache=True) if self.config.hg_cache else Null

            self.tuid_service = tuid_service if tuid_service else tuid.service.TUIDService(
                kwargs=self.config.tuid, conn=self.conn, clogger=self)
            self.rev_locker = Lock()
            self.working_locker = Lock()

            if new_table:
                with self.conn.transaction() as t:
                    t.execute("DROP TABLE IF EXISTS csetLog")

            self.init_db()
            self.next_revnum = coalesce(
                self.conn.get_one("SELECT max(revnum)+1 FROM csetLog")[0], 1)
            self.csets_todo_backwards = Queue(
                name="Clogger.csets_todo_backwards")
            self.deletions_todo = Queue(name="Clogger.deletions_todo")
            self.maintenance_signal = Signal(name="Clogger.maintenance_signal")

            if 'tuid' in self.config:
                self.config = self.config.tuid

            self.disable_backfilling = False
            self.disable_tipfilling = False
            self.disable_deletion = False
            self.disable_maintenance = False

            self.backfill_thread = None
            self.tipfill_thread = None
            self.deletion_thread = None
            self.maintenance_thread = None

            # Make sure we are filled before allowing queries
            numrevs = self.conn.get_one("SELECT count(revnum) FROM csetLog")[0]
            if numrevs < MINIMUM_PERMANENT_CSETS:
                Log.note("Filling in csets to hold {{minim}} csets.",
                         minim=MINIMUM_PERMANENT_CSETS)
                oldest_rev = 'tip'
                with self.conn.transaction() as t:
                    tmp = t.query(
                        "SELECT min(revnum), revision FROM csetLog").data[0][1]
                    if tmp:
                        oldest_rev = tmp
                self._fill_in_range(MINIMUM_PERMANENT_CSETS - numrevs,
                                    oldest_rev,
                                    timestamp=False)

            Log.note("Table is filled with atleast {{minim}} entries.",
                     minim=MINIMUM_PERMANENT_CSETS)

            if start_workers:
                self.start_workers()
        except Exception as e:
            Log.warning("Cannot setup clogger: {{cause}}", cause=str(e))

    def start_backfilling(self):
        if not self.backfill_thread:
            self.backfill_thread = Thread.run('clogger-backfill',
                                              self.fill_backward_with_list)

    def start_tipfillling(self):
        if not self.tipfill_thread:
            self.tipfill_thread = Thread.run('clogger-tip',
                                             self.fill_forward_continuous)

    def start_maintenance(self):
        if not self.maintenance_thread:
            self.maintenance_thread = Thread.run('clogger-maintenance',
                                                 self.csetLog_maintenance)

    def start_deleter(self):
        if not self.deletion_thread:
            self.deletion_thread = Thread.run('clogger-deleter',
                                              self.csetLog_deleter)

    def start_workers(self):
        self.start_tipfillling()
        self.start_backfilling()
        self.start_maintenance()
        self.start_deleter()
        Log.note("Started clogger workers.")

    def init_db(self):
        with self.conn.transaction() as t:
            t.execute('''
            CREATE TABLE IF NOT EXISTS csetLog (
                revnum         INTEGER PRIMARY KEY,
                revision       CHAR(12) NOT NULL,
                timestamp      INTEGER
            );''')

    def disable_all(self):
        self.disable_tipfilling = True
        self.disable_backfilling = True
        self.disable_maintenance = True
        self.disable_deletion = True

    def revnum(self):
        """
        :return: max revnum that was added
        """
        return coalesce(
            self.conn.get_one("SELECT max(revnum) as revnum FROM csetLog")[0],
            0)

    def get_tip(self, transaction):
        return transaction.get_one(
            "SELECT max(revnum) as revnum, revision FROM csetLog")

    def get_tail(self, transaction):
        return transaction.get_one(
            "SELECT min(revnum) as revnum, revision FROM csetLog")

    def _get_clog(self, clog_url):
        try:
            Log.note("Searching through changelog {{url}}", url=clog_url)
            clog_obj = http.get_json(clog_url, retry=RETRY)
            return clog_obj
        except Exception as e:
            Log.error(
                "Unexpected error getting changset-log for {{url}}: {{error}}",
                url=clog_url,
                error=e)

    def _get_one_revision(self, transaction, cset_entry):
        # Returns a single revision if it exists
        _, rev, _ = cset_entry
        return transaction.get_one(
            "SELECT revision FROM csetLog WHERE revision=?", (rev, ))

    def _get_one_revnum(self, transaction, rev):
        # Returns a single revnum if it exists
        return transaction.get_one(
            "SELECT revnum FROM csetLog WHERE revision=?", (rev, ))

    def _get_revnum_range(self, transaction, revnum1, revnum2):
        # Returns a range of revision numbers (that is inclusive)
        high_num = max(revnum1, revnum2)
        low_num = min(revnum1, revnum2)

        return transaction.query("SELECT revnum, revision FROM csetLog WHERE "
                                 "revnum >= " + str(low_num) +
                                 " AND revnum <= " + str(high_num)).data

    def recompute_table_revnums(self):
        '''
        Recomputes the revnums for the csetLog table
        by creating a new table, and copying csetLog to
        it. The INTEGER PRIMARY KEY in the temp table auto increments
        as rows are added.

        IMPORTANT: Only call this after acquiring the
                   lock `self.working_locker`.
        :return:
        '''
        with self.conn.transaction() as t:
            t.execute('''
            CREATE TABLE temp (
                revnum         INTEGER PRIMARY KEY,
                revision       CHAR(12) NOT NULL,
                timestamp      INTEGER
            );''')

            t.execute(
                "INSERT INTO temp (revision, timestamp) "
                "SELECT revision, timestamp FROM csetlog ORDER BY revnum ASC")

            t.execute("DROP TABLE csetLog;")
            t.execute("ALTER TABLE temp RENAME TO csetLog;")

    def check_for_maintenance(self):
        '''
        Returns True if the maintenance worker should be run now,
        and False otherwise.
        :return:
        '''
        numrevs = self.conn.get_one("SELECT count(revnum) FROM csetLog")[0]
        Log.note("Number of csets in csetLog table: {{num}}", num=numrevs)
        if numrevs >= SIGNAL_MAINTENANCE_CSETS:
            return True
        return False

    def add_cset_entries(self,
                         ordered_rev_list,
                         timestamp=False,
                         number_forward=True):
        '''
        Adds a list of revisions to the table. Assumes ordered_rev_list is ordered
        the way changesets are found in the changelog; going forwards or backwards
        is handled by flipping the list.
        :param ordered_rev_list: Order given from changeset log searching.
        :param timestamp: If False, records are kept indefinitely,
                          but if holes exist: (delete, None, delete, None)
                          the deletes with Nones around them
                          will not be deleted.
        :param number_forward: If True, number the revision list going forward
                               from max(revnum)+1; otherwise number it going
                               backwards from min(revnum)-1. Revnums are
                               recomputed afterwards in either case.
        :return:
        '''
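        # Illustrative numbering with hypothetical revisions: if current_max is 7
        # and number_forward is True, ["c3", "c2", "c1"] is reversed and inserted
        # as (8, "c1"), (9, "c2"), (10, "c3"); if number_forward is False and
        # current_min is 3, it is inserted as (2, "c3"), (1, "c2"), (0, "c1").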
        with self.conn.transaction() as t:
            current_min = t.get_one("SELECT min(revnum) FROM csetlog")[0]
            current_max = t.get_one("SELECT max(revnum) FROM csetlog")[0]
            if not current_min or not current_max:
                current_min = 0
                current_max = 0

            direction = -1
            start = current_min - 1
            if number_forward:
                direction = 1
                start = current_max + 1
                ordered_rev_list = ordered_rev_list[::-1]

            insert_list = [(start + direction * count, rev,
                            int(time.time()) if timestamp else -1)
                           for count, rev in enumerate(ordered_rev_list)]

            # In case of overlapping requests
            fmt_insert_list = []
            for cset_entry in insert_list:
                tmp = self._get_one_revision(t, cset_entry)
                if not tmp:
                    fmt_insert_list.append(cset_entry)

            for _, tmp_insert_list in jx.groupby(fmt_insert_list,
                                                 size=SQL_CSET_BATCH_SIZE):
                t.execute(
                    "INSERT INTO csetLog (revnum, revision, timestamp)" +
                    " VALUES " + sql_list(
                        quote_set((revnum, revision, timestamp))
                        for revnum, revision, timestamp in tmp_insert_list))

            # Move the revision numbers forward if needed
            self.recompute_table_revnums()

        # Start a maintenance run if needed
        if self.check_for_maintenance():
            Log.note("Scheduling maintenance run on clogger.")
            self.maintenance_signal.go()

    def _fill_in_range(self,
                       parent_cset,
                       child_cset,
                       timestamp=False,
                       number_forward=True):
        '''
        Fills cset logs in a certain range. 'parent_cset' can be an int, in which
        case that many changesets are gathered instead of searching for a specific
        parent; otherwise we continue until the given 'parent_cset' is found.
        When going backwards (number_forward is False) the first changeset of the
        first log is skipped because it already exists in the table.
        :param parent_cset:
        :param child_cset:
        :param timestamp:
        :param number_forward:
        :return:
        '''
        csets_to_add = []
        found_parent = False
        find_parent = False
        if type(parent_cset) != int:
            find_parent = True
        elif parent_cset >= MAX_BACKFILL_CLOGS * CHANGESETS_PER_CLOG:
            Log.warning(
                "Requested number of new changesets {{num}} is too high. "
                "Max number that can be requested is {{maxnum}}.",
                num=parent_cset,
                maxnum=MAX_BACKFILL_CLOGS * CHANGESETS_PER_CLOG)
            return None

        csets_found = 0
        clogs_seen = 0
        final_rev = child_cset
        while not found_parent and clogs_seen < MAX_BACKFILL_CLOGS:
            clog_url = str(
                HG_URL
            ) + "/" + self.config.hg.branch + "/json-log/" + final_rev
            clog_obj = self._get_clog(clog_url)
            clog_csets_list = list(clog_obj['changesets'])
            for clog_cset in clog_csets_list[:-1]:
                if not number_forward and csets_found <= 0:
                    # Skip this entry, it already exists
                    csets_found += 1
                    continue

                nodes_cset = clog_cset['node'][:12]
                if find_parent:
                    if nodes_cset == parent_cset:
                        found_parent = True
                        if not number_forward:
                            # When going backwards this entry is
                            # the given parent and must be added
                            csets_to_add.append(nodes_cset)
                        break
                else:
                    if csets_found + 1 > parent_cset:
                        found_parent = True
                        if not number_forward:
                            # When going backwards this entry is
                            # the given parent (which is supposed
                            # to already exist)
                            csets_to_add.append(nodes_cset)
                        break
                    csets_found += 1
                csets_to_add.append(nodes_cset)
            if found_parent:
                break

            clogs_seen += 1
            final_rev = clog_csets_list[-1]['node'][:12]

        if found_parent:
            self.add_cset_entries(csets_to_add,
                                  timestamp=timestamp,
                                  number_forward=number_forward)
        else:
            Log.warning(
                "Couldn't find the end of the request for {{request}}. "
                "Max number that can be requested through _fill_in_range is {{maxnum}}.",
                request={
                    'parent_cset': parent_cset,
                    'child_cset': child_cset,
                    'number_forward': number_forward
                },
                maxnum=MAX_BACKFILL_CLOGS * CHANGESETS_PER_CLOG)
            return None
        return csets_to_add

    def initialize_to_range(self, old_rev, new_rev, delete_old=True):
        '''
        Used in service testing to get to very old
        changesets quickly.
        :param old_rev: The oldest revision to keep
        :param new_rev: The revision to start searching from
        :return:
        '''
        old_settings = [
            self.disable_tipfilling, self.disable_backfilling,
            self.disable_maintenance, self.disable_deletion
        ]
        self.disable_tipfilling = True
        self.disable_backfilling = True
        self.disable_maintenance = True
        self.disable_deletion = True

        old_rev = old_rev[:12]
        new_rev = new_rev[:12]

        with self.working_locker:
            if delete_old:
                with self.conn.transaction() as t:
                    t.execute("DELETE FROM csetLog")
            with self.conn.transaction() as t:
                t.execute("INSERT INTO csetLog (revision, timestamp) VALUES " +
                          quote_set((new_rev, -1)))
            self._fill_in_range(old_rev,
                                new_rev,
                                timestamp=True,
                                number_forward=False)

        self.disable_tipfilling = old_settings[0]
        self.disable_backfilling = old_settings[1]
        self.disable_maintenance = old_settings[2]
        self.disable_deletion = old_settings[3]

    def fill_backward_with_list(self, please_stop=None):
        '''
        Expects requests of the tuple form: (parent_cset, timestamp)
        parent_cset can be an int X to go back by X changesets, or
        a string to search for going backwards in time. If timestamp
        is false, no timestamps will be added to the entries.
        :param please_stop:
        :return:
        '''
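        # Illustrative request shapes (the revision hash is hypothetical):
        #     self.csets_todo_backwards.add(("0d1e55d22345", True))  # backfill until this revision is found
        #     self.csets_todo_backwards.add((100, False))            # backfill 100 changesets, without timestamps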
        while not please_stop:
            try:
                request = self.csets_todo_backwards.pop(till=please_stop)
                if please_stop:
                    break

                # If backfilling is disabled, all requests
                # are ignored.
                if self.disable_backfilling:
                    Till(till=CSET_BACKFILL_WAIT_TIME).wait()
                    continue

                if request:
                    parent_cset, timestamp = request
                else:
                    continue

                with self.working_locker:
                    with self.conn.transaction() as t:
                        parent_revnum = self._get_one_revnum(t, parent_cset)
                    if parent_revnum:
                        continue

                    with self.conn.transaction() as t:
                        _, oldest_revision = self.get_tail(t)

                    self._fill_in_range(parent_cset,
                                        oldest_revision,
                                        timestamp=timestamp,
                                        number_forward=False)
                Log.note("Finished {{cset}}", cset=parent_cset)
            except Exception as e:
                Log.warning("Unknown error occurred during backfill: ",
                            cause=e)

    def update_tip(self):
        '''
        Returns False if the tip is already at the newest, or True
        if an update has taken place.
        :return:
        '''
        clog_obj = self._get_clog(
            str(HG_URL) + "/" + self.config.hg.branch + "/json-log/tip")

        # Get current tip in DB
        with self.conn.transaction() as t:
            _, newest_known_rev = self.get_tip(t)

        # If we are still at the newest, wait for CSET_TIP_WAIT_TIME seconds
        # before checking again.
        first_clog_entry = clog_obj['changesets'][0]['node'][:12]
        if newest_known_rev == first_clog_entry:
            return False

        csets_to_gather = None
        if not newest_known_rev:
            Log.note(
                "No revisions found in table, adding {{minim}} entries...",
                minim=MINIMUM_PERMANENT_CSETS)
            csets_to_gather = MINIMUM_PERMANENT_CSETS

        found_newest_known = False
        csets_to_add = []
        csets_found = 0
        clogs_seen = 0
        Log.note("Found new revisions. Updating csetLog tip to {{rev}}...",
                 rev=first_clog_entry)
        while not found_newest_known and clogs_seen < MAX_TIPFILL_CLOGS:
            clog_csets_list = list(clog_obj['changesets'])
            for clog_cset in clog_csets_list[:-1]:
                nodes_cset = clog_cset['node'][:12]
                if not csets_to_gather:
                    if nodes_cset == newest_known_rev:
                        found_newest_known = True
                        break
                else:
                    if csets_found >= csets_to_gather:
                        found_newest_known = True
                        break
                csets_found += 1
                csets_to_add.append(nodes_cset)
            if not found_newest_known:
                # Get the next page
                clogs_seen += 1
                final_rev = clog_csets_list[-1]['node'][:12]
                clog_url = str(
                    HG_URL
                ) + "/" + self.config.hg.branch + "/json-log/" + final_rev
                clog_obj = self._get_clog(clog_url)

        if clogs_seen >= MAX_TIPFILL_CLOGS:
            Log.error(
                "Too many changesets, can't find last tip or the number is too high: {{rev}}. "
                "Maximum possible to request is {{maxnum}}",
                rev=coalesce(newest_known_rev, csets_to_gather),
                maxnum=MAX_TIPFILL_CLOGS * CHANGESETS_PER_CLOG)
            return False

        with self.working_locker:
            Log.note("Adding {{csets}}", csets=csets_to_add)
            self.add_cset_entries(csets_to_add, timestamp=False)
        return True

    def fill_forward_continuous(self, please_stop=None):
        while not please_stop:
            try:
                while (not please_stop
                       and not self.disable_tipfilling
                       and self.update_tip()):
                    pass
                (please_stop | Till(seconds=CSET_TIP_WAIT_TIME)).wait()
            except Exception as e:
                Log.warning("Unknown error occurred during tip filling:",
                            cause=e)

    def csetLog_maintenance(self, please_stop=None):
        '''
        Handles deleting old csetLog entries and timestamping
        revisions once they pass the length for permanent
        storage for deletion later.
        :param please_stop:
        :return:
        '''
        while not please_stop:
            try:
                # Wait until something signals the maintenance cycle
                # to begin (or end).
                (self.maintenance_signal | please_stop).wait()

                if please_stop:
                    break
                if self.disable_maintenance:
                    continue

                Log.warning(
                    "Starting clog maintenance. Since this doesn't start often, "
                    "we need to explicitly see when it's started with this warning."
                )

                # Reset signal so we don't request
                # maintenance infinitely.
                with self.maintenance_signal.lock:
                    self.maintenance_signal._go = False

                with self.working_locker:
                    all_data = None
                    with self.conn.transaction() as t:
                        all_data = sorted(t.get(
                            "SELECT revnum, revision, timestamp FROM csetLog"),
                                          key=lambda x: int(x[0]))

                    # Restore maximum permanents (if overflowing)
                    new_data = []
                    modified = False
                    for count, (revnum, revision,
                                timestamp) in enumerate(all_data[::-1]):
                        if count < MINIMUM_PERMANENT_CSETS:
                            if timestamp != -1:
                                modified = True
                                new_data.append((revnum, revision, -1))
                            else:
                                new_data.append((revnum, revision, timestamp))
                        elif type(timestamp) != int or timestamp == -1:
                            modified = True
                            new_data.append(
                                (revnum, revision, int(time.time())))
                        else:
                            new_data.append((revnum, revision, timestamp))

                    # Delete annotations at revisions with timestamps
                    # that are too old. The csetLog entries will have
                    # their timestamps reset here.
                    new_data1 = []
                    annrevs_to_del = []
                    current_time = time.time()
                    for count, (revnum, revision,
                                timestamp) in enumerate(new_data[::-1]):
                        new_timestamp = timestamp
                        if timestamp != -1:
                            if current_time >= timestamp + TIME_TO_KEEP_ANNOTATIONS.seconds:
                                modified = True
                                new_timestamp = current_time
                                annrevs_to_del.append(revision)
                        new_data1.append((revnum, revision, new_timestamp))

                    if len(annrevs_to_del) > 0:
                        # Delete any latestFileMod and annotation entries
                        # that are too old.
                        Log.note(
                            "Deleting annotations and latestFileMod for revisions for being "
                            "older than {{oldest}}: {{revisions}}",
                            oldest=TIME_TO_KEEP_ANNOTATIONS,
                            revisions=annrevs_to_del)
                        with self.conn.transaction() as t:
                            t.execute(
                                "DELETE FROM latestFileMod WHERE revision IN "
                                + quote_set(annrevs_to_del))
                            t.execute(
                                "DELETE FROM annotations WHERE revision IN " +
                                quote_set(annrevs_to_del))

                    # Delete any overflowing entries
                    new_data2 = new_data1
                    reved_all_data = all_data[::-1]
                    deleted_data = reved_all_data[MAXIMUM_NONPERMANENT_CSETS:]
                    delete_overflowing_revstart = None
                    if len(deleted_data) > 0:
                        _, delete_overflowing_revstart, _ = deleted_data[0]
                        new_data2 = set(all_data) - set(deleted_data)

                        # Update old frontiers if requested, otherwise
                        # they will all get deleted by the csetLog_deleter
                        # worker
                        if UPDATE_VERY_OLD_FRONTIERS:
                            _, max_revision, _ = all_data[-1]
                            for _, revision, _ in deleted_data:
                                with self.conn.transaction() as t:
                                    old_files = t.get(
                                        "SELECT file FROM latestFileMod WHERE revision=?",
                                        (revision, ))
                                if old_files is None or len(old_files) <= 0:
                                    continue

                                self.tuid_service.get_tuids_from_files(
                                    old_files,
                                    max_revision,
                                    going_forward=True,
                                )

                                still_exist = True
                                while still_exist and not please_stop:
                                    Till(seconds=TUID_EXISTENCE_WAIT_TIME
                                         ).wait()
                                    with self.conn.transaction() as t:
                                        old_files = t.get(
                                            "SELECT file FROM latestFileMod WHERE revision=?",
                                            (revision, ))
                                    if old_files is None or len(
                                            old_files) <= 0:
                                        still_exist = False

                    # Update table and schedule a deletion
                    if modified:
                        with self.conn.transaction() as t:
                            insert_into_db_chunked(
                                t, new_data2,
                                "INSERT OR REPLACE INTO csetLog (revnum, revision, timestamp) VALUES "
                            )
                    if not deleted_data:
                        continue

                    Log.note("Scheduling {{num_csets}} for deletion",
                             num_csets=len(deleted_data))
                    self.deletions_todo.add(delete_overflowing_revstart)
            except Exception as e:
                Log.warning(
                    "Unexpected error occured while maintaining csetLog, continuing to try: ",
                    cause=e)
        return

    def csetLog_deleter(self, please_stop=None):
        '''
        Deletes changesets from the csetLog table
        and also changesets from the annotation table
        that have revisions matching the given changesets.
        Accepts single changesets from self.deletions_todo; every csetLog
        entry at or before that changeset's revnum is deleted.
        :param please_stop:
        :return:
        '''
        while not please_stop:
            try:
                request = self.deletions_todo.pop(till=please_stop)
                if please_stop:
                    break

                # If deletion is disabled, ignore the current
                # request - it will need to be re-requested.
                if self.disable_deletion:
                    Till(till=CSET_DELETION_WAIT_TIME).wait()
                    continue

                with self.working_locker:
                    first_cset = request

                    # Since we are deleting and moving stuff around in the
                    # TUID tables, we need everything to be contained in
                    # one transaction with no interruptions.
                    with self.conn.transaction() as t:
                        revnum = self._get_one_revnum(t, first_cset)[0]
                        csets_to_del = t.get(
                            "SELECT revnum, revision FROM csetLog WHERE revnum <= ?",
                            (revnum, ))
                        csets_to_del = [cset for _, cset in csets_to_del]
                        existing_frontiers = t.query(
                            "SELECT revision FROM latestFileMod WHERE revision IN "
                            + quote_set(csets_to_del)).data

                        existing_frontiers = [
                            existing_frontiers[i][0]
                            for i, _ in enumerate(existing_frontiers)
                        ]
                        Log.note(
                            "Deleting all annotations and changeset log entries with revisions in the list: {{csets}}",
                            csets=csets_to_del)

                        if len(existing_frontiers) > 0:
                            # This handles files which no longer exist anymore in
                            # the main branch.
                            Log.note(
                                "Deleting existing frontiers for revisions: {{revisions}}",
                                revisions=existing_frontiers)
                            t.execute(
                                "DELETE FROM latestFileMod WHERE revision IN "
                                + quote_set(existing_frontiers))

                        Log.note("Deleting annotations...")
                        t.execute(
                            "DELETE FROM annotations WHERE revision IN " +
                            quote_set(csets_to_del))

                        Log.note("Deleting {{num_entries}} csetLog entries...",
                                 num_entries=len(csets_to_del))
                        t.execute("DELETE FROM csetLog WHERE revision IN " +
                                  quote_set(csets_to_del))

                    # Recalculate the revnums
                    self.recompute_table_revnums()
            except Exception as e:
                Log.warning(
                    "Unexpected error occured while deleting from csetLog:",
                    cause=e)
                Till(seconds=CSET_DELETION_WAIT_TIME).wait()
        return

    def get_old_cset_revnum(self, revision):
        self.csets_todo_backwards.add((revision, True))

        revnum = None
        timeout = Till(seconds=BACKFILL_REVNUM_TIMEOUT)
        while not timeout:
            with self.conn.transaction() as t:
                revnum = self._get_one_revnum(t, revision)

            if revnum and revnum[0] >= 0:
                break
            elif revnum and revnum[0] < 0:
                Log.note("Waiting for table to recompute...")
            else:
                Log.note("Waiting for backfill to complete...")
            Till(seconds=CSET_BACKFILL_WAIT_TIME).wait()

        if timeout:
            Log.error(
                "Cannot find revision {{rev}} after waiting {{timeout}} seconds",
                rev=revision,
                timeout=BACKFILL_REVNUM_TIMEOUT)
        return revnum

    def get_revnnums_from_range(self, revision1, revision2):
        with self.conn.transaction() as t:
            revnum1 = self._get_one_revnum(t, revision1)
            revnum2 = self._get_one_revnum(t, revision2)
        if not revnum1 or not revnum2:
            did_an_update = self.update_tip()
            if did_an_update:
                with self.conn.transaction() as t:
                    revnum1 = self._get_one_revnum(t, revision1)
                    revnum2 = self._get_one_revnum(t, revision2)

            if not revnum1:
                revnum1 = self.get_old_cset_revnum(revision1)
                # Refresh the second entry
                with self.conn.transaction() as t:
                    revnum2 = self._get_one_revnum(t, revision2)

            if not revnum2:
                revnum2 = self.get_old_cset_revnum(revision2)

                # The first revnum might change also
                with self.conn.transaction() as t:
                    revnum1 = self._get_one_revnum(t, revision1)

        with self.conn.transaction() as t:
            result = self._get_revnum_range(t, revnum1[0], revnum2[0])
        return sorted(result, key=lambda x: int(x[0]))
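Clogger enforces a single shared instance by overriding `__new__`. A minimal standalone sketch of that pattern (the class name is illustrative); note that `__init__` still runs on every construction call even though the same object is returned:

class SingletonExample:
    _INSTANCE = None  # shared across all constructions

    def __new__(cls, *args, **kwargs):
        if cls._INSTANCE is None:
            cls._INSTANCE = object.__new__(cls)
        return cls._INSTANCE

a = SingletonExample()
b = SingletonExample()
assert a is b  # both names point at the same object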
Example #14
0
class FromESMetadata(Schema):
    """
    QUERY THE METADATA
    """

    def __new__(cls, *args, **kwargs):
        global singlton
        if singlton:
            return singlton
        else:
            singlton = object.__new__(cls)
            return singlton

    @override
    def __init__(self, host, index, alias=None, name=None, port=9200, kwargs=None):
        global _elasticsearch
        if hasattr(self, "settings"):
            return

        from pyLibrary.queries.containers.list_usingPythonList import ListContainer
        from pyLibrary.env import elasticsearch as _elasticsearch

        self.settings = kwargs
        self.default_name = coalesce(name, alias, index)
        self.default_es = _elasticsearch.Cluster(kwargs=kwargs)
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        self.es_metadata = Null
        self.last_es_metadata = Date.now()-OLD_METADATA

        self.meta=Data()
        table_columns = metadata_tables()
        column_columns = metadata_columns()
        self.meta.tables = ListContainer("meta.tables", [], wrap({c.names["."]: c for c in table_columns}))
        self.meta.columns = ColumnList()
        self.meta.columns.insert(column_columns)
        self.meta.columns.insert(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("refresh metadata", self.not_monitor)
        return

    @property
    def query_path(self):
        return None

    @property
    def url(self):
        return self.default_es.path + "/" + self.default_name.replace(".", "/")

    def get_table(self, table_name):
        with self.meta.tables.locker:
            return wrap([t for t in self.meta.tables.data if t.name == table_name])

    def _upsert_column(self, c):
        # ASSUMING THE self.meta.columns.locker LOCK IS HELD
        existing_columns = self.meta.columns.find(c.es_index, c.names["."])
        if not existing_columns:
            self.meta.columns.add(c)
            self.todo.add(c)

            if ENABLE_META_SCAN:
                if DEBUG:
                    Log.note("todo: {{table}}::{{column}}", table=c.es_index, column=c.es_column)
                # MARK meta.columns AS DIRTY TOO
                cols = self.meta.columns.find("meta.columns", None)
                for cc in cols:
                    cc.partitions = cc.cardinality = None
                    cc.last_updated = Date.now()
                self.todo.extend(cols)
        else:
            canonical = existing_columns[0]
            if canonical is not c:
                set_default(c.names, canonical.names)
                for key in Column.__slots__:
                    canonical[key] = c[key]
            if DEBUG:
                Log.note("todo: {{table}}::{{column}}", table=canonical.es_index, column=canonical.es_column)
            self.todo.add(canonical)

    def _get_columns(self, table=None):
        # TODO: HANDLE MORE THAN ONE ES, MAP TABLE SHORT_NAME TO ES INSTANCE
        table_path = split_field(table)
        es_index = table_path[0]
        query_path = join_field(table_path[1:])
        meta = self.es_metadata.indices[es_index]
        if not meta or self.last_es_metadata < Date.now() - OLD_METADATA:
            self.es_metadata = self.default_es.get_metadata(force=True)
            meta = self.es_metadata.indices[es_index]

        for _, properties in meta.mappings.items():
            properties.properties["_id"] = {"type": "string", "index": "not_analyzed"}
            self._parse_properties(meta.index, properties, meta)

    def _parse_properties(self, abs_index, properties, meta):
        # IT IS IMPORTANT THAT NESTED PROPERTIES NAME ALL COLUMNS, AND
        # ALL COLUMNS ARE GIVEN NAMES FOR ALL NESTED PROPERTIES
        abs_columns = _elasticsearch.parse_properties(abs_index, None, properties.properties)
        abs_columns = abs_columns.filter(  # TODO: REMOVE WHEN jobs PROPERTY EXPLOSION IS CONTAINED
            lambda r: not r.es_column.startswith("other.") and
                      not r.es_column.startswith("previous_values.cf_") and
                      not r.es_index.startswith("debug") and
                      r.es_column.find("=") == -1 and
                      r.es_column.find(" ") == -1
        )

        def add_column(c, query_path):
            c.last_updated = Date.now()
            if query_path[0] != ".":
                c.names[query_path[0]] = relative_field(c.names["."], query_path[0])

            with self.meta.columns.locker:
                self._upsert_column(c)
                for alias in meta.aliases:
                    c = copy(c)
                    c.es_index = alias
                    self._upsert_column(c)

        with Timer("upserting {{num}} columns", {"num": len(abs_columns)}, debug=DEBUG):
            # LIST OF EVERY NESTED PATH
            query_paths = [[c.es_column] for c in abs_columns if c.type == "nested"]
            for a, b in itertools.product(query_paths, query_paths):
                aa = a[0]
                bb = b[0]
                if aa and bb.startswith(aa):
                    for i, b_prefix in enumerate(b):
                        if len(b_prefix) > len(aa):
                            continue
                        if aa == b_prefix:
                            break  # SPLIT ALREADY FOUND
                        b.insert(i, aa)
                        break
            for q in query_paths:
                q.append(".")
            query_paths.append(ROOT_PATH)

            # ADD RELATIVE COLUMNS
            for abs_column in abs_columns:
                for query_path in query_paths:
                    add_column(abs_column, query_path)

    def query(self, _query):
        return self.meta.columns.query(QueryOp(set_default(
            {
                "from": self.meta.columns,
                "sort": ["table", "name"]
            },
            _query.__data__()
        )))

    def get_columns(self, table_name, column_name=None, force=False):
        """
        RETURN METADATA COLUMNS
        """
        table_path = split_field(table_name)
        es_index_name = table_path[0]
        query_path = join_field(table_path[1:])
        table = self.get_table(es_index_name)[0]
        abs_column_name = None if column_name == None else concat_field(query_path, column_name)

        try:
            # LAST TIME WE GOT INFO FOR THIS TABLE
            if not table:
                table = Table(
                    name=es_index_name,
                    url=None,
                    query_path=None,
                    timestamp=Date.now()
                )
                with self.meta.tables.locker:
                    self.meta.tables.add(table)
                self._get_columns(table=es_index_name)
            elif force or table.timestamp == None or table.timestamp < Date.now() - MAX_COLUMN_METADATA_AGE:
                table.timestamp = Date.now()
                self._get_columns(table=es_index_name)

            with self.meta.columns.locker:
                columns = self.meta.columns.find(es_index_name, column_name)
            if columns:
                columns = jx.sort(columns, "names.\.")
                # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
                while len(self.todo) and not all(columns.get("last_updated")):
                    if DEBUG:
                        Log.note("waiting for columns to update {{columns|json}}", columns=[c.es_index+"."+c.es_column for c in columns if not c.last_updated])
                    Till(seconds=1).wait()
                return columns
        except Exception as e:
            Log.error("Not expected", cause=e)

        if abs_column_name:
            Log.error("no columns matching {{table}}.{{column}}", table=table_name, column=abs_column_name)
        else:
            self._get_columns(table=table_name)  # TO TEST WHAT HAPPENED
            Log.error("no columns for {{table}}?!", table=table_name)

    def _update_cardinality(self, c):
        """
        QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN
        """
        if c.type in STRUCT:
            Log.error("not supported")
        try:
            if c.es_index == "meta.columns":
                with self.meta.columns.locker:
                    partitions = jx.sort([g[c.es_column] for g, _ in jx.groupby(self.meta.columns, c.es_column) if g[c.es_column] != None])
                    self.meta.columns.update({
                        "set": {
                            "partitions": partitions,
                            "count": len(self.meta.columns),
                            "cardinality": len(partitions),
                            "last_updated": Date.now()
                        },
                        "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                    })
                return
            if c.es_index == "meta.tables":
                with self.meta.columns.locker:
                    partitions = jx.sort([g[c.es_column] for g, _ in jx.groupby(self.meta.tables, c.es_column) if g[c.es_column] != None])
                    self.meta.columns.update({
                        "set": {
                            "partitions": partitions,
                            "count": len(self.meta.tables),
                            "cardinality": len(partitions),
                            "last_updated": Date.now()
                        },
                        "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                    })
                return

            es_index = c.es_index.split(".")[0]
            result = self.default_es.post("/" + es_index + "/_search", data={
                "aggs": {c.names["."]: _counting_query(c)},
                "size": 0
            })
            r = result.aggregations.values()[0]
            count = result.hits.total
            cardinality = coalesce(r.value, r._nested.value, 0 if r.doc_count==0 else None)
            if cardinality == None:
                Log.error("logic error")

            query = Data(size=0)
            if cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99):
                if DEBUG:
                    Log.note("{{table}}.{{field}} has {{num}} parts", table=c.es_index, field=c.es_column, num=cardinality)
                with self.meta.columns.locker:
                    self.meta.columns.update({
                        "set": {
                            "count": count,
                            "cardinality": cardinality,
                            "last_updated": Date.now()
                        },
                        "clear": ["partitions"],
                        "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                    })
                return
            elif c.type in _elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
                if DEBUG:
                    Log.note("{{field}} has {{num}} parts", field=c.name, num=cardinality)
                with self.meta.columns.locker:
                    self.meta.columns.update({
                        "set": {
                            "count": count,
                            "cardinality": cardinality,
                            "last_updated": Date.now()
                        },
                        "clear": ["partitions"],
                        "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                    })
                return
            elif len(c.nested_path) != 1:
                query.aggs[literal_field(c.names["."])] = {
                    "nested": {"path": c.nested_path[0]},
                    "aggs": {"_nested": {"terms": {"field": c.es_column, "size": 0}}}
                }
            else:
                query.aggs[literal_field(c.names["."])] = {"terms": {"field": c.es_column, "size": 0}}

            result = self.default_es.post("/" + es_index + "/_search", data=query)

            aggs = result.aggregations.values()[0]
            if aggs._nested:
                parts = jx.sort(aggs._nested.buckets.key)
            else:
                parts = jx.sort(aggs.buckets.key)

            if DEBUG:
                Log.note("{{field}} has {{parts}}", field=c.name, parts=parts)
            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "partitions": parts,
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                })
        except Exception as e:
            if "IndexMissingException" in e and c.es_index.startswith(TEST_TABLE_PREFIX):
                with self.meta.columns.locker:
                    self.meta.columns.update({
                        "set": {
                            "count": 0,
                            "cardinality": 0,
                            "last_updated": Date.now()
                        },
                        "clear":[
                            "partitions"
                        ],
                        "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                    })
            else:
                self.meta.columns.update({
                    "set": {
                        "last_updated": Date.now()
                    },
                    "clear": [
                        "count",
                        "cardinality",
                        "partitions",
                    ],
                    "where": {"eq": {"names.\\.": ".", "es_index": c.es_index, "es_column": c.es_column}}
                })
                Log.warning("Could not get {{col.es_index}}.{{col.es_column}} info", col=c, cause=e)

    def monitor(self, please_stop):
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            try:
                if not self.todo:
                    with self.meta.columns.locker:
                        old_columns = [
                            c
                            for c in self.meta.columns
                            if (c.last_updated == None or c.last_updated < Date.now()-TOO_OLD) and c.type not in STRUCT
                        ]
                        if old_columns:
                            if DEBUG:
                                Log.note("Old columns wth dates {{dates|json}}", dates=wrap(old_columns).last_updated)
                            self.todo.extend(old_columns)
                            # TEST CONSISTENCY
                            for c, d in product(list(self.todo.queue), list(self.todo.queue)):
                                if c.es_column == d.es_column and c.es_index == d.es_index and c != d:
                                    Log.error("")
                        else:
                            if DEBUG:
                                Log.note("no more metatdata to update")

                column = self.todo.pop(Till(seconds=(10*MINUTE).seconds))
                if column:
                    if DEBUG:
                        Log.note("update {{table}}.{{column}}", table=column.es_index, column=column.es_column)
                    if column.type in STRUCT:
                        with self.meta.columns.locker:
                            column.last_updated = Date.now()
                        continue
                    elif column.last_updated >= Date.now()-TOO_OLD:
                        continue
                    try:
                        self._update_cardinality(column)
                        if DEBUG and not column.es_index.startswith(TEST_TABLE_PREFIX):
                            Log.note("updated {{column.name}}", column=column)
                    except Exception as e:
                        Log.warning("problem getting cardinality for {{column.name}}", column=column, cause=e)
            except Exception as e:
                Log.warning("problem in cardinality monitor", cause=e)

    def not_monitor(self, please_stop):
        Log.alert("metadata scan has been disabled")
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            c = self.todo.pop()
            if c == THREAD_STOP:
                break

            if not c.last_updated or c.last_updated >= Date.now()-TOO_OLD:
                continue

            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "last_updated": Date.now()
                    },
                    "clear":[
                        "count",
                        "cardinality",
                        "partitions",
                    ],
                    "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                })
            if DEBUG:
                Log.note("Could not get {{col.es_index}}.{{col.es_column}} info", col=c)
Example #15
class ElasticsearchMetadata(Namespace):
    """
    MANAGE SNOWFLAKE SCHEMAS FOR EACH OF THE ALIASES FOUND IN THE CLUSTER
    """

    @override
    def __new__(cls, kwargs, *args, **_kwargs):
        es_cluster = elasticsearch.Cluster(kwargs)
        output = known_clusters.get(id(es_cluster))
        if output is None:
            output = object.__new__(cls)
            known_clusters[id(es_cluster)] = output
        return output

    @override
    def __init__(self, host, index, sql_file='metadata.sqlite', alias=None, name=None, port=9200, kwargs=None):
        if hasattr(self, "settings"):
            return

        self.too_old = TOO_OLD
        self.settings = kwargs
        self.default_name = coalesce(name, alias, index)
        self.es_cluster = elasticsearch.Cluster(kwargs=kwargs)

        self.index_does_not_exist = set()
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        self.index_to_alias = {}

        self.es_metadata = Null
        self.metadata_last_updated = Date.now() - OLD_METADATA

        self.meta = Data()
        self.meta.columns = ColumnList(URL(self.es_cluster.settings.host).host)

        self.alias_to_query_paths = {
            "meta.columns": [ROOT_PATH],
            "meta.tables": [ROOT_PATH]
        }
        self.alias_last_updated = {
            "meta.columns": Date.now(),
            "meta.tables": Date.now()
        }
        table_columns = metadata_tables()
        self.meta.tables = ListContainer(
            "meta.tables",
            [],
            jx_base.Schema(".", table_columns)
        )
        self.meta.columns.extend(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("not refresh metadata", self.not_monitor)
        return

    @property
    def namespace(self):
        return self.meta.columns.namespace

    @property
    def url(self):
        return self.es_cluster.url / self.default_name.replace(".", "/")

    def _reload_columns(self, table_desc):
        """
        :param table_desc: TABLE DESCRIPTION; ITS name IS A REAL ALIAS (OR NAME OF INDEX THAT HAS NO ALIAS)
        :return: LIST OF CANONICAL COLUMNS FOR THE ALIAS
        """
        # FIND ALL INDEXES OF ALIAS
        es_last_updated = self.es_cluster.metatdata_last_updated

        alias = table_desc.name
        canonical_index = self.es_cluster.get_best_matching_index(alias).index
        es_metadata_update_required = not (table_desc.timestamp < es_last_updated)
        metadata = self.es_cluster.get_metadata(force=es_metadata_update_required)

        props = [
            (self.es_cluster.get_index(index=i, type=t, debug=DEBUG), t, m.properties)
            for i, d in metadata.indices.items()
            if alias in d.aliases
            for t, m in [_get_best_type_from_mapping(d.mappings)]
        ]

        # CONFIRM ALL COLUMNS ARE SAME, FIX IF NOT
        dirty = False
        all_comparisions = list(jx.pairwise(props)) + list(jx.pairwise(jx.reverse(props)))
        # NOTICE THE SAME (index, type, properties) TRIPLE FROM ABOVE
        for (i1, t1, p1), (i2, t2, p2) in all_comparisions:
            diff = elasticsearch.diff_schema(p2, p1)
            if not self.settings.read_only:
                for d in diff:
                    dirty = True
                    i1.add_property(*d)
        meta = self.es_cluster.get_metadata(force=dirty).indices[canonical_index]

        data_type, mapping = _get_best_type_from_mapping(meta.mappings)
        mapping.properties["_id"] = {"type": "string", "index": "not_analyzed"}
        columns = self._parse_properties(alias, mapping)
        table_desc.timestamp = es_last_updated
        return columns

    def _parse_properties(self, alias, mapping):
        abs_columns = elasticsearch.parse_properties(alias, ".", ROOT_PATH, mapping.properties)
        if DEBUG and any(c.cardinality == 0 and c.name != '_id' for c in abs_columns):
            Log.warning(
                "Some columns are not stored in {{url}} {{index|quote}} table:\n{{names}}",
                url=self.es_cluster.url,
                index=alias,
                names=[
                    ".".join((c.es_index, c.name))
                    for c in abs_columns
                    if c.cardinality == 0
                ]
            )

        with Timer("upserting {{num}} columns", {"num": len(abs_columns)}, silent=not DEBUG):
            # LIST OF EVERY NESTED PATH
            query_paths = [[c.es_column] for c in abs_columns if c.es_type == "nested"]
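            # IF ONE NESTED PATH IS A PREFIX OF ANOTHER, INSERT THE SHORTER PATH
            # INTO THE LONGER PATH'S CHAIN SO EVERY QUERY PATH LISTS ALL OF ITS
            # ANCESTOR SPLITS, DEEPEST FIRST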
            for a, b in itertools.product(query_paths, query_paths):
                aa = a[0]
                bb = b[0]
                if aa and bb.startswith(aa):
                    for i, b_prefix in enumerate(b):
                        if len(b_prefix) > len(aa):
                            continue
                        if aa == b_prefix:
                            break  # SPLIT ALREADY FOUND
                        b.insert(i, aa)
                        break
            for q in query_paths:
                q.append(".")
            query_paths.append(ROOT_PATH)

            # ENSURE ALL TABLES HAVE THE QUERY PATHS SET
            self.alias_to_query_paths[alias] = query_paths
            for i, a in self.index_to_alias.items():
                if a == alias:
                    self.alias_to_query_paths[i] = query_paths

            # ENSURE COLUMN HAS CORRECT jx_type
            # PICK DEEPEST NESTED PROPERTY AS REPRESENTATIVE
            output = []
            best = {}
            for abs_column in abs_columns:
                abs_column.jx_type = jx_type(abs_column)
                if abs_column.jx_type not in STRUCT:
                    clean_name = unnest_path(abs_column.name)
                    other = best.get(clean_name)
                    if other:
                        if len(other.nested_path) < len(abs_column.nested_path):
                            output.remove(other)
                            self.meta.columns.update({"clear": ".", "where": {"eq": {"es_column": other.es_column, "es_index": other.es_index}}})
                        else:
                            continue
                    best[clean_name] = abs_column
                output.append(abs_column)

            # REGISTER ALL COLUMNS
            canonicals = []
            for abs_column in output:
                canonical = self.meta.columns.add(abs_column)
                canonicals.append(canonical)

            self.todo.extend(canonicals)
            return canonicals

    def query(self, _query):
        return self.meta.columns.query(QueryOp(set_default(
            {
                "from": self.meta.columns,
                "sort": ["table", "name"]
            },
            _query.__data__()
        )))

    def _find_alias(self, name):
        if self.metadata_last_updated < self.es_cluster.metatdata_last_updated:
            for a in self.es_cluster.get_aliases():
                self.index_to_alias[a.index] = coalesce(a.alias, a.index)
                self.alias_last_updated.setdefault(a.alias, Date.MIN)
        if name in self.alias_last_updated:
            return name
        else:
            return self.index_to_alias.get(name)

    def get_columns(self, table_name, column_name=None, after=None, timeout=None):
        """
        RETURN METADATA COLUMNS

        :param table_name: TABLE WE WANT COLUMNS FOR
        :param column_name:  OPTIONAL NAME, IF INTERESTED IN ONLY ONE COLUMN
        :param after: FORCE LOAD, WAITING FOR last_updated TO BE AFTER THIS TIME
        :param timeout: Signal; True when should give up
        :return:
        """
        DEBUG and after and Log.note("getting columns for after {{time}}", time=after)
        table_path = split_field(table_name)
        root_table_name = table_path[0]

        alias = self._find_alias(root_table_name)
        if not alias:
            self.es_cluster.get_metadata(force=True)
            alias = self._find_alias(root_table_name)
            if not alias:
                Log.error("{{table|quote}} does not exist", table=table_name)

        try:
            table = self.get_table(alias)[0]
            # LAST TIME WE GOT INFO FOR THIS TABLE
            if not table:
                table = TableDesc(
                    name=alias,
                    url=None,
                    query_path=["."],
                    timestamp=Date.MIN
                )
                with self.meta.tables.locker:
                    self.meta.tables.add(table)
                columns = self._reload_columns(table)
                DEBUG and Log.note("columns from reload")
            elif after or table.timestamp < self.es_cluster.metatdata_last_updated:
                columns = self._reload_columns(table)
                DEBUG and Log.note("columns from reload")
            else:
                columns = self.meta.columns.find(alias, column_name)
                DEBUG and Log.note("columns from find()")

            DEBUG and Log.note("columns are {{ids}}", ids=[id(c) for c in columns])

            columns = jx.sort(columns, "name")

            if after is None:
                return columns  # DO NOT WAIT FOR COMPLETE COLUMNS

            # WAIT FOR THE COLUMNS TO UPDATE
            while True:
                pending = [c for c in columns if after >= c.last_updated or (c.cardinality == None and c.jx_type not in STRUCT)]
                if not pending:
                    break
                if timeout:
                    Log.error("trying to gets columns timed out")
                if DEBUG:
                    if len(pending) > 10:
                        Log.note("waiting for {{num}} columns to update by {{timestamp}}", num=len(pending), timestamp=after)
                    else:
                        Log.note("waiting for columns to update by {{timestamp}}; {{columns|json}}", timestamp=after, columns=[c.es_index + "." + c.es_column + " id="+text_type(id(c)) for c in pending])
                Till(seconds=1).wait()
            return columns
        except Exception as e:
            Log.error("Failure to get columns for {{table}}", table=table_name, cause=e)

        return []

    def _update_cardinality(self, column):
        """
        QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN
        """
        now = Date.now()
        if column.es_index in self.index_does_not_exist:
            return

        if column.jx_type in STRUCT:
            Log.error("not supported")
        try:
            if column.es_index == "meta.columns":
                partitions = jx.sort([g[column.es_column] for g, _ in jx.groupby(self.meta.columns, column.es_column) if g[column.es_column] != None])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.columns),
                        "cardinality": len(partitions),
                        "multi": 1,
                        "last_updated": now
                    },
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            if column.es_index == "meta.tables":
                partitions = jx.sort([g[column.es_column] for g, _ in jx.groupby(self.meta.tables, column.es_column) if g[column.es_column] != None])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.tables),
                        "cardinality": len(partitions),
                        "multi": 1,
                        "last_updated": now
                    },
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return

            es_index = column.es_index.split(".")[0]

            is_text = [cc for cc in self.meta.columns if cc.es_column == column.es_column and cc.es_type == "text"]
            if is_text:
                # text IS A MULTIVALUE STRING THAT CAN ONLY BE FILTERED
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "aggs": {
                        "count": {"filter": {"match_all": {}}}
                    },
                    "size": 0
                })
                count = result.hits.total
                cardinality = max(1001, count)
                multi = 1001
            elif column.es_column == "_id":
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "query": {"match_all": {}},
                    "size": 0
                })
                count = cardinality = result.hits.total
                multi = 1
            elif column.es_type == BOOLEAN:
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "aggs": {
                        "count": _counting_query(column)
                    },
                    "size": 0
                })
                count = result.hits.total
                cardinality = 2

                DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "partitions": [False, True],
                        "multi": 1,
                        "last_updated": now
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            else:
                es_query = {
                    "aggs": {
                        "count": _counting_query(column),
                        "_filter": {
                            "aggs": {"multi": {"max": {"script": "doc[" + quote(column.es_column) + "].values.size()"}}},
                            "filter": {"bool": {"should": [
                                {"range": {"etl.timestamp.~n~": {"gte": (Date.today() - WEEK)}}},
                                {"bool": {"must_not": {"exists": {"field": "etl.timestamp.~n~"}}}}
                            ]}}
                        }
                    },
                    "size": 0
                }

                result = self.es_cluster.post("/" + es_index + "/_search", data=es_query)
                agg_results = result.aggregations
                count = result.hits.total
                cardinality = coalesce(agg_results.count.value, agg_results.count._nested.value, agg_results.count.doc_count)
                multi = int(coalesce(agg_results._filter.multi.value, 1))
                if cardinality == None:
                    Log.error("logic error")

            query = Data(size=0)

            if column.es_column == "_id":
                self.meta.columns.update({
                    "set": {
                        "count": cardinality,
                        "cardinality": cardinality,
                        "multi": 1,
                        "last_updated": now
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            elif cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99):
                DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "last_updated": now
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            elif column.es_type in elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
                DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "last_updated": now
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            elif len(column.nested_path) != 1:
                query.aggs["_"] = {
                    "nested": {"path": column.nested_path[0]},
                    "aggs": {"_nested": {"terms": {"field": column.es_column}}}
                }
            elif cardinality == 0:  # WHEN DOES THIS HAPPEN?
                query.aggs["_"] = {"terms": {"field": column.es_column}}
            else:
                query.aggs["_"] = {"terms": {"field": column.es_column, "size": cardinality}}

            result = self.es_cluster.post("/" + es_index + "/_search", data=query)

            aggs = result.aggregations._
            if aggs._nested:
                parts = jx.sort(aggs._nested.buckets.key)
            else:
                parts = jx.sort(aggs.buckets.key)

            DEBUG and Log.note("update metadata for {{column.es_index}}.{{column.es_column}} (id={{id}}) at {{time}}", id=id(column), column=column, time=now)
            self.meta.columns.update({
                "set": {
                    "count": count,
                    "cardinality": cardinality,
                    "multi": multi,
                    "partitions": parts,
                    "last_updated": now
                },
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
        except Exception as e:
            # CAN NOT IMPORT: THE TEST MODULE SETS UP LOGGING
            # from tests.test_jx import TEST_TABLE
            e = Except.wrap(e)
            TEST_TABLE = "testdata"
            is_missing_index = any(w in e for w in ["IndexMissingException", "index_not_found_exception"])
            is_test_table = column.es_index.startswith((TEST_TABLE_PREFIX, TEST_TABLE))
            if is_missing_index:
                # WE EXPECT TEST TABLES TO DISAPPEAR
                Log.warning("Missing index {{col.es_index}}", col=column, cause=e)
                self.meta.columns.update({
                    "clear": ".",
                    "where": {"eq": {"es_index": column.es_index}}
                })
                self.index_does_not_exist.add(column.es_index)
            elif "No field found for" in e:
                self.meta.columns.update({
                    "clear": ".",
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                Log.warning("Could not get column {{col.es_index}}.{{col.es_column}} info", col=column, cause=e)
            else:
                self.meta.columns.update({
                    "set": {
                        "last_updated": now
                    },
                    "clear": [
                        "count",
                        "cardinality",
                        "multi",
                        "partitions",
                    ],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                Log.warning("Could not get {{col.es_index}}.{{col.es_column}} info", col=column, cause=e)

    def monitor(self, please_stop):
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            try:
                if not self.todo:
                    old_columns = [
                        c
                        for c in self.meta.columns
                        if ((c.last_updated < Date.now() - MAX_COLUMN_METADATA_AGE) or c.cardinality == None) and c.jx_type not in STRUCT
                    ]
                    if old_columns:
                        DEBUG and Log.note(
                            "Old columns {{names|json}} last updated {{dates|json}}",
                            names=wrap(old_columns).es_column,
                            dates=[Date(t).format() for t in wrap(old_columns).last_updated]
                        )
                        self.todo.extend(old_columns)
                    else:
                        DEBUG and Log.note("no more metatdata to update")

                column = self.todo.pop(Till(seconds=(10*MINUTE).seconds))
                if column:
                    if column is THREAD_STOP:
                        continue

                    with Timer("update {{table}}.{{column}}", param={"table": column.es_index, "column": column.es_column}, silent=not DEBUG):
                        if column.es_index in self.index_does_not_exist:
                            DEBUG and Log.note("{{column.es_column}} does not exist", column=column)
                            self.meta.columns.update({
                                "clear": ".",
                                "where": {"eq": {"es_index": column.es_index}}
                            })
                            continue
                        if column.jx_type in STRUCT or split_field(column.es_column)[-1] == EXISTS_TYPE:
                            DEBUG and Log.note("{{column.es_column}} is a struct", column=column)
                            column.last_updated = Date.now()
                            continue
                        elif column.last_updated > Date.now() - TOO_OLD and column.cardinality is not None:
                            # DO NOT UPDATE FRESH COLUMN METADATA
                            DEBUG and Log.note("{{column.es_column}} is still fresh ({{ago}} ago)", column=column, ago=(Date.now()-Date(column.last_updated)).seconds)
                            continue
                        try:
                            self._update_cardinality(column)
                            (DEBUG and not column.es_index.startswith(TEST_TABLE_PREFIX)) and Log.note("updated {{column.name}}", column=column)
                        except Exception as e:
                            if '"status":404' in e:
                                self.meta.columns.update({
                                    "clear": ".",
                                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                                })
                            else:
                                Log.warning("problem getting cardinality for {{column.name}}", column=column, cause=e)
            except Exception as e:
                Log.warning("problem in cardinality monitor", cause=e)

    def not_monitor(self, please_stop):
        Log.alert("metadata scan has been disabled")
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            column = self.todo.pop()
            if column == THREAD_STOP:
                break

            if column.jx_type in STRUCT or split_field(column.es_column)[-1] == EXISTS_TYPE:
                DEBUG and Log.note("{{column.es_column}} is a struct", column=column)
                column.last_updated = Date.now()
                continue
            elif column.last_updated > Date.now() - TOO_OLD and column.cardinality is not None:
                # DO NOT UPDATE FRESH COLUMN METADATA
                DEBUG and Log.note("{{column.es_column}} is still fresh ({{ago}} ago)", column=column, ago=(Date.now()-Date(column.last_updated)).seconds)
                continue

            with Timer("Update {{col.es_index}}.{{col.es_column}}", param={"col": column}, silent=not DEBUG, too_long=0.05):
                if untype_path(column.name) in ["build.type", "run.type"]:
                    try:
                        self._update_cardinality(column)
                    except Exception as e:
                        Log.warning("problem getting cardinality for {{column.name}}", column=column, cause=e)
                else:
                    column.last_updated = Date.now()


    def get_table(self, name):
        if name == "meta.columns":
            return self.meta.columns

        with self.meta.tables.locker:
            return wrap([t for t in self.meta.tables.data if t.name == name])

    def get_snowflake(self, fact_table_name):
        return Snowflake(fact_table_name, self)

    def get_schema(self, name):
        if name == "meta.columns":
            return self.meta.columns.schema
        if name == "meta.tables":
            return self.meta.tables
        root, rest = tail_field(name)
        return self.get_snowflake(root).get_schema(rest)
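
A hypothetical usage sketch for ElasticsearchMetadata as defined above. The host, index, and timeout values are placeholders, not from the source; per the get_columns docstring, passing after makes the call wait until every returned column has been refreshed past that timestamp:

# HYPOTHETICAL USAGE (HOST, INDEX, AND TIMEOUT ARE PLACEHOLDERS)
namespace = ElasticsearchMetadata(host="http://localhost:9200", index="unittest")

# RETURN WHATEVER COLUMN METADATA IS CURRENTLY KNOWN
columns = namespace.get_columns("unittest")

# FORCE A REFRESH AND WAIT UNTIL EVERY COLUMN IS UPDATED PAST now
now = Date.now()
columns = namespace.get_columns("unittest", after=now, timeout=Till(seconds=60))
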
Example #16
class ElasticsearchMetadata(Namespace):
    """
    MANAGE SNOWFLAKE SCHEMAS FOR EACH OF THE ALIASES FOUND IN THE CLUSTER
    """
    def __new__(cls, *args, **kwargs):
        if jx_base_meta.singlton:
            return jx_base_meta.singlton
        else:
            jx_base_meta.singlton = object.__new__(cls)
            return jx_base_meta.singlton

    @override
    def __init__(self,
                 host,
                 index,
                 sql_file='metadata.sqlite',
                 alias=None,
                 name=None,
                 port=9200,
                 kwargs=None):
        if hasattr(self, "settings"):
            return

        self.too_old = TOO_OLD
        self.settings = kwargs
        self.default_name = coalesce(name, alias, index)
        self.es_cluster = elasticsearch.Cluster(kwargs=kwargs)
        self.index_does_not_exist = set()
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        self.index_to_alias = Relation_usingList()

        self.es_metadata = Null
        self.metadata_last_updated = Date.now() - OLD_METADATA

        self.meta = Data()
        self.meta.columns = ColumnList()

        self.alias_to_query_paths = {
            "meta.columns": [['.']],
            "meta.tables": [['.']]
        }
        self.alias_last_updated = {
            "meta.columns": Date.now(),
            "meta.tables": Date.now()
        }
        table_columns = metadata_tables()
        self.meta.tables = ListContainer("meta.tables", [],
                                         jx_base.Schema(".", table_columns))
        self.meta.columns.extend(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("refresh metadata", self.not_monitor)
        return

    @property
    def url(self):
        return self.es_cluster.path + "/" + self.default_name.replace(".", "/")

    def _reload_columns(self, table_desc):
        """
        :param table_desc: TABLE DESCRIPTION; ITS name IS A REAL ALIAS (OR NAME OF INDEX THAT HAS NO ALIAS)
        :return:
        """
        # FIND ALL INDEXES OF ALIAS
        es_last_updated = self.es_cluster.metatdata_last_updated

        alias = table_desc.name
        canonical_index = self.es_cluster.get_best_matching_index(alias).index
        update_required = not (table_desc.timestamp < es_last_updated)
        metadata = self.es_cluster.get_metadata(force=update_required)

        indexes = self.index_to_alias.get_domain(alias)
        props = [(self.es_cluster.get_index(index=i, type=t,
                                            debug=DEBUG), t, m.properties)
                 for i, d in metadata.indices.items() if i in indexes
                 for t, m in [_get_best_type_from_mapping(d.mappings)]]

        # CONFIRM ALL COLUMNS ARE SAME, FIX IF NOT
        dirty = False
        all_comparisions = list(jx.pairwise(props)) + list(
            jx.pairwise(jx.reverse(props)))
        # NOTICE THE SAME (index, type, properties) TRIPLE FROM ABOVE
        for (i1, t1, p1), (i2, t2, p2) in all_comparisions:
            diff = elasticsearch.diff_schema(p2, p1)
            if not self.settings.read_only:
                for d in diff:
                    dirty = True
                    i1.add_property(*d)
        meta = self.es_cluster.get_metadata(
            force=dirty).indices[canonical_index]

        data_type, mapping = _get_best_type_from_mapping(meta.mappings)
        mapping.properties["_id"] = {"type": "string", "index": "not_analyzed"}
        self._parse_properties(alias, mapping, meta)
        table_desc.timestamp = es_last_updated

    def _parse_properties(self, alias, mapping, meta):
        abs_columns = elasticsearch.parse_properties(alias, None,
                                                     mapping.properties)
        with Timer("upserting {{num}} columns", {"num": len(abs_columns)},
                   debug=DEBUG):
            # LIST OF EVERY NESTED PATH
            query_paths = [[c.es_column] for c in abs_columns
                           if c.es_type == "nested"]
            for a, b in itertools.product(query_paths, query_paths):
                aa = a[0]
                bb = b[0]
                if aa and bb.startswith(aa):
                    for i, b_prefix in enumerate(b):
                        if len(b_prefix) > len(aa):
                            continue
                        if aa == b_prefix:
                            break  # SPLIT ALREADY FOUND
                        b.insert(i, aa)
                        break
            for q in query_paths:
                q.append(SELF_PATH)
            query_paths.append(ROOT_PATH)
            self.alias_to_query_paths[alias] = query_paths

            # ADD RELATIVE NAMES
            for abs_column in abs_columns:
                abs_column.last_updated = None
                abs_column.jx_type = es_type_to_json_type[abs_column.es_type]
                for query_path in query_paths:
                    abs_column.names[query_path[0]] = relative_field(
                        abs_column.names["."], query_path[0])
                self.todo.add(self.meta.columns.add(abs_column))
        pass

    def query(self, _query):
        return self.meta.columns.query(
            QueryOp(
                set_default(
                    {
                        "from": self.meta.columns,
                        "sort": ["table", "name"]
                    }, _query.__data__())))

    def _find_alias(self, name):
        if self.metadata_last_updated < self.es_cluster.metatdata_last_updated:
            for a in self.es_cluster.get_aliases():
                self.index_to_alias[a.index] = coalesce(a.alias, a.index)
                self.alias_last_updated.setdefault(a.alias, Date.MIN)
        if name in self.alias_last_updated:
            return name
        else:
            return self.index_to_alias[name]

    def get_columns(self, table_name, column_name=None, force=False):
        """
        RETURN METADATA COLUMNS
        """
        table_path = split_field(table_name)
        root_table_name = table_path[0]

        alias = self._find_alias(root_table_name)
        if not alias:
            self.es_cluster.get_metadata(force=True)
            alias = self._find_alias(root_table_name)
            if not alias:
                Log.error("{{table|quote}} does not exist", table=table_name)

        try:
            last_update = MAX([
                self.es_cluster.index_last_updated[i]
                for i in self.index_to_alias.get_domain(alias)
            ])

            table = self.get_table(alias)[0]
            # LAST TIME WE GOT INFO FOR THIS TABLE
            if not table:
                table = TableDesc(name=alias,
                                  url=None,
                                  query_path=['.'],
                                  timestamp=Date.MIN)
                with self.meta.tables.locker:
                    self.meta.tables.add(table)
                self._reload_columns(table)
            elif force or table.timestamp < last_update:
                self._reload_columns(table)

            columns = self.meta.columns.find(alias, column_name)
            columns = jx.sort(columns, "names.\\.")
            # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
            while len(self.todo) and not all(columns.get("last_updated")):
                if DEBUG:
                    if len(columns) > 10:
                        Log.note("waiting for {{num}} columns to update",
                                 num=len([
                                     c for c in columns if not c.last_updated
                                 ]))
                    else:
                        Log.note(
                            "waiting for columns to update {{columns|json}}",
                            columns=[
                                c.es_index + "." + c.es_column for c in columns
                                if not c.last_updated
                            ])
                Till(seconds=1).wait()
            return columns
        except Exception as e:
            Log.error("Not expected", cause=e)

        return []

    def _update_cardinality(self, column):
        """
        QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN
        """
        if column.es_index in self.index_does_not_exist:
            return

        if column.jx_type in STRUCT:
            Log.error("not supported")
        try:
            if column.es_index == "meta.columns":
                partitions = jx.sort([
                    g[column.es_column]
                    for g, _ in jx.groupby(self.meta.columns, column.es_column)
                    if g[column.es_column] != None
                ])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.columns),
                        "cardinality": len(partitions),
                        "multi": 1,
                        "last_updated": Date.now()
                    },
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
                return
            if column.es_index == "meta.tables":
                partitions = jx.sort([
                    g[column.es_column]
                    for g, _ in jx.groupby(self.meta.tables, column.es_column)
                    if g[column.es_column] != None
                ])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.tables),
                        "cardinality": len(partitions),
                        "multi": 1,
                        "last_updated": Date.now()
                    },
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
                return

            es_index = column.es_index.split(".")[0]

            is_text = [
                cc for cc in self.meta.columns
                if cc.es_column == column.es_column and cc.es_type == "text"
            ]
            if is_text:
                # text IS A MULTIVALUE STRING THAT CAN ONLY BE FILTERED
                result = self.es_cluster.post("/" + es_index + "/_search",
                                              data={
                                                  "aggs": {
                                                      "count": {
                                                          "filter": {
                                                              "match_all": {}
                                                          }
                                                      }
                                                  },
                                                  "size": 0
                                              })
                count = result.hits.total
                cardinality = 1001
                multi = 1001
            elif column.es_column == "_id":
                result = self.es_cluster.post("/" + es_index + "/_search",
                                              data={
                                                  "query": {
                                                      "match_all": {}
                                                  },
                                                  "size": 0
                                              })
                count = cardinality = result.hits.total
                multi = 1
            elif column.es_type == BOOLEAN:
                result = self.es_cluster.post("/" + es_index + "/_search",
                                              data={
                                                  "aggs": {
                                                      "count":
                                                      _counting_query(column)
                                                  },
                                                  "size": 0
                                              })
                count = result.hits.total
                cardinality = 2
                multi = 1
            else:
                result = self.es_cluster.post(
                    "/" + es_index + "/_search",
                    data={
                        "aggs": {
                            "count": _counting_query(column),
                            "multi": {
                                "max": {
                                    "script":
                                    "doc[" + quote(column.es_column) +
                                    "].values.size()"
                                }
                            }
                        },
                        "size": 0
                    })
                agg_results = result.aggregations
                count = result.hits.total
                cardinality = coalesce(agg_results.count.value,
                                       agg_results.count._nested.value,
                                       agg_results.count.doc_count)
                multi = int(coalesce(agg_results.multi.value, 1))
                if cardinality == None:
                    Log.error("logic error")

            query = Data(size=0)

            if column.es_column == "_id":
                self.meta.columns.update({
                    "set": {
                        "count": cardinality,
                        "cardinality": cardinality,
                        "multi": 1,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
                return
            elif cardinality > 1000 or (count >= 30 and cardinality == count
                                        ) or (count >= 1000
                                              and cardinality / count > 0.99):
                DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts",
                                   table=column.es_index,
                                   field=column.es_column,
                                   num=cardinality)
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
                return
            elif column.es_type in elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
                DEBUG and Log.note("{{field}} has {{num}} parts",
                                   field=column.es_index,
                                   num=cardinality)
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
                return
            elif len(column.nested_path) != 1:
                query.aggs["_"] = {
                    "nested": {
                        "path": column.nested_path[0]
                    },
                    "aggs": {
                        "_nested": {
                            "terms": {
                                "field": column.es_column
                            }
                        }
                    }
                }
            elif cardinality == 0:
                query.aggs["_"] = {"terms": {"field": column.es_column}}
            else:
                query.aggs["_"] = {
                    "terms": {
                        "field": column.es_column,
                        "size": cardinality
                    }
                }

            result = self.es_cluster.post("/" + es_index + "/_search",
                                          data=query)

            aggs = result.aggregations._
            if aggs._nested:
                parts = jx.sort(aggs._nested.buckets.key)
            else:
                parts = jx.sort(aggs.buckets.key)

            self.meta.columns.update({
                "set": {
                    "count": count,
                    "cardinality": cardinality,
                    "multi": multi,
                    "partitions": parts,
                    "last_updated": Date.now()
                },
                "where": {
                    "eq": {
                        "es_index": column.es_index,
                        "es_column": column.es_column
                    }
                }
            })
        except Exception as e:
            # CAN NOT IMPORT: THE TEST MODULE SETS UP LOGGING
            # from tests.test_jx import TEST_TABLE
            TEST_TABLE = "testdata"
            is_missing_index = any(
                w in e for w in
                ["IndexMissingException", "index_not_found_exception"])
            is_test_table = any(
                column.es_index.startswith(t)
                for t in [TEST_TABLE_PREFIX, TEST_TABLE])
            if is_missing_index and is_test_table:
                # WE EXPECT TEST TABLES TO DISAPPEAR
                self.meta.columns.update({
                    "clear": ".",
                    "where": {
                        "eq": {
                            "es_index": column.es_index
                        }
                    }
                })
                self.index_does_not_exist.add(column.es_index)
            else:
                self.meta.columns.update({
                    "set": {
                        "last_updated": Date.now()
                    },
                    "clear": [
                        "count",
                        "cardinality",
                        "multi",
                        "partitions",
                    ],
                    "where": {
                        "eq": {
                            "names.\\.": ".",
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
                Log.warning(
                    "Could not get {{col.es_index}}.{{col.es_column}} info",
                    col=column,
                    cause=e)

    def monitor(self, please_stop):
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            try:
                if not self.todo:
                    old_columns = [
                        c for c in self.meta.columns
                        if (c.last_updated == None or c.last_updated <
                            Date.now() - TOO_OLD) and c.jx_type not in STRUCT
                    ]
                    if old_columns:
                        DEBUG and Log.note(
                            "Old columns {{names|json}} last updated {{dates|json}}",
                            names=wrap(old_columns).es_column,
                            dates=[
                                Date(t).format()
                                for t in wrap(old_columns).last_updated
                            ])
                        self.todo.extend(old_columns)
                        # TEST CONSISTENCY
                        for c, d in product(list(self.todo.queue),
                                            list(self.todo.queue)):
                            if c.es_column == d.es_column and c.es_index == d.es_index and c != d:
                                Log.error("")
                    else:
                        DEBUG and Log.note("no more metatdata to update")

                column = self.todo.pop(Till(seconds=(10 * MINUTE).seconds))
                if column:
                    if column is THREAD_STOP:
                        continue

                    DEBUG and Log.note("update {{table}}.{{column}}",
                                       table=column.es_index,
                                       column=column.es_column)
                    if column.es_index in self.index_does_not_exist:
                        self.meta.columns.update({
                            "clear": ".",
                            "where": {
                                "eq": {
                                    "es_index": column.es_index
                                }
                            }
                        })
                        continue
                    if column.jx_type in STRUCT or column.es_column.endswith(
                            "." + EXISTS_TYPE):
                        column.last_updated = Date.now()
                        continue
                    elif column.last_updated >= Date.now() - TOO_OLD:
                        continue
                    try:
                        self._update_cardinality(column)
                        (DEBUG
                         and not column.es_index.startswith(TEST_TABLE_PREFIX)
                         ) and Log.note("updated {{column.name}}",
                                        column=column)
                    except Exception as e:
                        Log.warning(
                            "problem getting cardinality for {{column.name}}",
                            column=column,
                            cause=e)
            except Exception as e:
                Log.warning("problem in cardinality monitor", cause=e)

    def not_monitor(self, please_stop):
        Log.alert("metadata scan has been disabled")
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            c = self.todo.pop()
            if c == THREAD_STOP:
                break

            if c.last_updated >= Date.now() - TOO_OLD:
                continue

            self.meta.columns.update({
                "set": {
                    "last_updated": Date.now()
                },
                "clear": [
                    "count",
                    "cardinality",
                    "multi",
                    "partitions",
                ],
                "where": {
                    "eq": {
                        "es_index": c.es_index,
                        "es_column": c.es_column
                    }
                }
            })
            DEBUG and Log.note(
                "Did not get {{col.es_index}}.{{col.es_column}} info", col=c)

    def get_table(self, alias_name):
        with self.meta.tables.locker:
            return wrap(
                [t for t in self.meta.tables.data if t.name == alias_name])

    def get_snowflake(self, fact_table_name):
        return Snowflake(fact_table_name, self)

    def get_schema(self, name):
        if name == "meta.columns":
            return self.meta.columns.schema
        query_path = split_field(name)
        root, rest = query_path[0], join_field(query_path[1:])
        return self.get_snowflake(root).get_schema(rest)
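
Both versions of _update_cardinality above apply the same thresholds when deciding whether a column's partition values are worth enumerating with a terms aggregation. A condensed sketch of just that decision, with the thresholds copied from the code above (the function name is illustrative and the inlined set stands in for the module's elasticsearch.ES_NUMERIC_TYPES):

# STAND-IN FOR elasticsearch.ES_NUMERIC_TYPES USED ABOVE (ASSUMED CONTENTS)
NUMERIC_TYPES = {"long", "integer", "short", "byte", "double", "float"}

def worth_fetching_partitions(es_column, es_type, count, cardinality):
    if es_column == "_id":
        return False  # UNIQUE PER DOCUMENT; PARTITIONS WOULD BE THE WHOLE INDEX
    if cardinality > 1000:
        return False  # TOO MANY DISTINCT VALUES TO ENUMERATE
    if count >= 30 and cardinality == count:
        return False  # EVERY VALUE IS DISTINCT; EFFECTIVELY AN IDENTIFIER
    if count >= 1000 and cardinality / count > 0.99:
        return False  # NEARLY UNIQUE
    if es_type in NUMERIC_TYPES and cardinality > 30:
        return False  # TREAT WIDE NUMERIC RANGES AS CONTINUOUS
    return True  # SMALL ENOUGH TO ENUMERATE
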
Example #17
File: cache.py  Project: mars-f/ActiveData
class Cache(object):
    """
    For Caching hg.mo requests
    """

    @override
    def __init__(self, rate=None, amortization_period=None, source=None, database=None, kwargs=None):
        self.amortization_period = coalesce(amortization_period, AMORTIZATION_PERIOD)
        self.rate = coalesce(rate, HG_REQUEST_PER_SECOND)
        self.cache_locker = Lock()
        self.cache = {}  # MAP FROM url TO (ready, headers, response, timestamp) PAIR
        self.no_cache = {}  # VERY SHORT TERM CACHE
        self.workers = []
        self.todo = Queue(APP_NAME+" todo")
        self.requests = Queue(APP_NAME + " requests", max=int(self.rate * self.amortization_period.seconds))
        self.url = URL(source.url)
        self.db = Sqlite(database)
        self.inbound_rate = RateLogger("Inbound")
        self.outbound_rate = RateLogger("hg.mo")

        if not self.db.query("SELECT name FROM sqlite_master WHERE type='table'").data:
            with self.db.transaction() as t:
                t.execute(
                    "CREATE TABLE cache ("
                    "   path TEXT PRIMARY KEY, "
                    "   headers TEXT, "
                    "   response TEXT, "
                    "   timestamp REAL "
                    ")"
                )

        self.threads = [
            Thread.run(APP_NAME+" worker" + text_type(i), self._worker)
            for i in range(CONCURRENCY)
        ]
        self.limiter = Thread.run(APP_NAME+" limiter", self._rate_limiter)
        self.cleaner = Thread.run(APP_NAME+" cleaner", self._cache_cleaner)

    def _rate_limiter(self, please_stop):
        try:
            max_requests = self.requests.max
            recent_requests = []
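            # ALLOW AT MOST max_requests OUTBOUND REQUESTS PER amortization_period;
            # recent_requests HOLDS THE RELEASE TIMESTAMPS STILL INSIDE THAT WINDOW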

            while not please_stop:
                now = Date.now()
                too_old = now - self.amortization_period

                recent_requests = [t for t in recent_requests if t > too_old]

                num_recent = len(recent_requests)
                if num_recent >= max_requests:
                    space_free_at = recent_requests[0] + self.amortization_period
                    (please_stop | Till(till=space_free_at.unix)).wait()
                    continue
                for _ in xrange(num_recent, max_requests):
                    request = self.todo.pop()
                    now = Date.now()
                    recent_requests.append(now)
                    self.requests.add(request)
        except Exception as e:
            Log.warning("failure", cause=e)

    def _cache_cleaner(self, please_stop):
        while not please_stop:
            now = Date.now()
            too_old = now-CACHE_RETENTION

            remove = set()
            with self.cache_locker:
                for path, (ready, headers, response, timestamp) in self.cache.items():
                    if timestamp < too_old:
                        remove.add(path)
                for r in remove:
                    del self.cache[r]
            (please_stop | Till(seconds=CACHE_RETENTION.seconds / 2)).wait()

    def please_cache(self, path):
        """
        :return: False if `path` is not to be cached
        """
        if path.endswith("/tip"):
            return False
        if any(k in path for k in ["/json-annotate/", "/json-info/", "/json-log/", "/json-rev/", "/rev/", "/raw-rev/", "/raw-file/", "/json-pushes", "/pushloghtml", "/file/"]):
            return True

        return False

    def request(self, method, path, headers):
        now = Date.now()
        self.inbound_rate.add(now)
        ready = Signal(path)

        # TEST CACHE
        with self.cache_locker:
            pair = self.cache.get(path)
            if pair is None:
                self.cache[path] = (ready, None, None, now)

        if pair is not None:
            # REQUEST IS IN THE QUEUE ALREADY, WAIT
            ready, headers, response, then = pair
            if response is None:
                ready.wait()
                with self.cache_locker:
                    ready, headers, response, timestamp = self.cache.get(path)
            with self.db.transaction() as t:
                t.execute("UPDATE cache SET timestamp=" + quote_value(now) + " WHERE path=" + quote_value(path) + " AND timestamp<" + quote_value(now))
            return Response(
                response,
                status=200,
                headers=json.loads(headers)
            )

        # TEST DB
        db_response = self.db.query("SELECT headers, response FROM cache WHERE path=" + quote_value(path)).data
        if db_response:
            headers, response = db_response[0]
            with self.db.transaction() as t:
                t.execute("UPDATE cache SET timestamp=" + quote_value(now) + " WHERE path=" + quote_value(path) + " AND timestamp<" + quote_value(now))
            with self.cache_locker:
                self.cache[path] = (ready, headers, response.encode('latin1'), now)
            ready.go()

            return Response(
                response,
                status=200,
                headers=json.loads(headers)
            )

        # MAKE A NETWORK REQUEST
        self.todo.add((ready, method, path, headers, now))
        ready.wait()
        with self.cache_locker:
            ready, headers, response, timestamp = self.cache[path]
        return Response(
            response,
            status=200,
            headers=json.loads(headers)
        )

    def _worker(self, please_stop):
        while not please_stop:
            pair = self.requests.pop(till=please_stop)
            if please_stop:
                break
            ready, method, path, req_headers, timestamp = pair

            try:
                url = self.url / path
                self.outbound_rate.add(Date.now())
                response = http.request(method, url, req_headers)

                del response.headers['transfer-encoding']
                resp_headers = value2json(response.headers)
                resp_content = response.raw.read()

                please_cache = self.please_cache(path)
                if please_cache:
                    with self.db.transaction() as t:
                        t.execute("INSERT INTO cache (path, headers, response, timestamp) VALUES" + quote_list((path, resp_headers, resp_content.decode('latin1'), timestamp)))
                with self.cache_locker:
                    self.cache[path] = (ready, resp_headers, resp_content, timestamp)
            except Exception as e:
                Log.warning("problem with request to {{path}}", path=path, cause=e)
                with self.cache_locker:
                    ready, headers, response, timestamp = self.cache[path]
                    del self.cache[path]
            finally:
                ready.go()
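
The rate-limiting loop in this class (the one that prunes recent_requests against the amortization period) is a sliding-window rate limiter: it keeps the timestamps of recent requests, prunes entries older than the window, and waits until the oldest entry ages out whenever the window is full. A minimal standalone sketch of the same idea, using only the standard library (the names SlidingWindowLimiter, max_requests and period are illustrative and not part of the code above):

import time
from collections import deque


class SlidingWindowLimiter(object):
    """ALLOW AT MOST max_requests CALLS PER ROLLING period (SECONDS)"""

    def __init__(self, max_requests, period):
        self.max_requests = max_requests
        self.period = period
        self.recent = deque()  # TIMESTAMPS OF RECENT CALLS, OLDEST FIRST

    def acquire(self):
        while True:
            now = time.monotonic()
            # DROP TIMESTAMPS THAT HAVE AGED OUT OF THE WINDOW
            while self.recent and self.recent[0] <= now - self.period:
                self.recent.popleft()
            if len(self.recent) < self.max_requests:
                self.recent.append(now)
                return
            # WAIT UNTIL THE OLDEST CALL FALLS OUT OF THE WINDOW
            time.sleep(self.recent[0] + self.period - now)


# ILLUSTRATIVE USAGE: CALL acquire() BEFORE EACH OUTBOUND REQUEST
# limiter = SlidingWindowLimiter(max_requests=10, period=60.0)
# limiter.acquire()
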
Example #18
class FromESMetadata(Schema):
    """
    QUERY THE METADATA
    """
    def __new__(cls, *args, **kwargs):
        if jx_base_meta.singlton:
            return jx_base_meta.singlton
        else:
            jx_base_meta.singlton = object.__new__(cls)
            return jx_base_meta.singlton

    @override
    def __init__(self,
                 host,
                 index,
                 sql_file='metadata.sqlite',
                 alias=None,
                 name=None,
                 port=9200,
                 kwargs=None):
        if hasattr(self, "settings"):
            return

        self.too_old = TOO_OLD
        self.settings = kwargs
        self.default_name = coalesce(name, alias, index)
        self.default_es = elasticsearch.Cluster(kwargs=kwargs)
        self.index_does_not_exist = set()
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        self.es_metadata = Null
        self.abs_columns = set()
        self.last_es_metadata = Date.now() - OLD_METADATA

        self.meta = Data()
        table_columns = metadata_tables()
        column_columns = metadata_columns()
        self.meta.tables = ListContainer(
            "meta.tables", [], wrap({c.names["."]: c
                                     for c in table_columns}))
        self.meta.columns = ColumnList()
        self.meta.columns.insert(column_columns)
        self.meta.columns.insert(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("refresh metadata", self.not_monitor)
        return

    @property
    def query_path(self):
        return None

    @property
    def url(self):
        return self.default_es.path + "/" + self.default_name.replace(".", "/")

    def get_table(self, table_name):
        with self.meta.tables.locker:
            return wrap(
                [t for t in self.meta.tables.data if t.name == table_name])

    def _upsert_column(self, c):
        # ASSUMING THE self.meta.columns.locker LOCK IS HELD
        existing_columns = self.meta.columns.find(c.es_index, c.names["."])
        for canonical in existing_columns:
            if canonical.type == c.type and canonical is not c:
                set_default(c.names, canonical.names)
                for key in Column.__slots__:
                    canonical[key] = c[key]
                if DEBUG:
                    Log.note("todo: {{table}}::{{column}}",
                             table=canonical.es_index,
                             column=canonical.es_column)
                self.todo.add(canonical)
                break
        else:
            self.meta.columns.add(c)
            self.todo.add(c)

            if ENABLE_META_SCAN:
                if DEBUG:
                    Log.note("todo: {{table}}::{{column}}",
                             table=c.es_index,
                             column=c.es_column)
                # MARK meta.columns AS DIRTY TOO
                cols = self.meta.columns.find("meta.columns", None)
                for cc in cols:
                    cc.partitions = cc.cardinality = None
                    cc.last_updated = Date.now() - TOO_OLD
                self.todo.extend(cols)

    def _get_columns(self, table=None):
        # TODO: HANDLE MORE THAN ONE ES, MAP TABLE SHORT_NAME TO ES INSTANCE
        table_path = split_field(table)
        es_index = table_path[0]
        meta = self.es_metadata.indices[es_index]
        if not meta or self.last_es_metadata < Date.now() - OLD_METADATA:
            self.es_metadata = self.default_es.get_metadata(force=True)
            meta = self.es_metadata.indices[es_index]

        for data_type, properties in meta.mappings.items():
            if data_type == "_default_":
                continue
            properties.properties["_id"] = {
                "type": "string",
                "index": "not_analyzed"
            }
            self._parse_properties(meta.index, properties, meta)

    def _parse_properties(self, abs_index, properties, meta):
        # IT IS IMPORTANT THAT NESTED PROPERTIES NAME ALL COLUMNS, AND
        # ALL COLUMNS ARE GIVEN NAMES FOR ALL NESTED PROPERTIES
        def add_column(c, query_path):
            c.last_updated = Date.now() - TOO_OLD
            if query_path[0] != ".":
                c.names[query_path[0]] = relative_field(
                    c.names["."], query_path[0])

            with self.meta.columns.locker:
                for alias in meta.aliases:
                    c_ = copy(c)
                    c_.es_index = alias
                    self._upsert_column(c_)
                self._upsert_column(c)

        abs_columns = elasticsearch.parse_properties(abs_index, None,
                                                     properties.properties)
        self.abs_columns.update(abs_columns)
        with Timer("upserting {{num}} columns", {"num": len(abs_columns)},
                   debug=DEBUG):
            # LIST OF EVERY NESTED PATH
            query_paths = [[c.es_column] for c in abs_columns
                           if c.type == "nested"]
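            # FOR EACH NESTED PATH, INSERT ANY SHORTER NESTED PATH THAT IS A PREFIX, SO EVERY PATH LISTS ITS ANCESTORS (DEEPEST FIRST)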
            for a, b in itertools.product(query_paths, query_paths):
                aa = a[0]
                bb = b[0]
                if aa and bb.startswith(aa):
                    for i, b_prefix in enumerate(b):
                        if len(b_prefix) > len(aa):
                            continue
                        if aa == b_prefix:
                            break  # SPLIT ALREADY FOUND
                        b.insert(i, aa)
                        break
            for q in query_paths:
                q.append(".")
            query_paths.append(SELF_PATH)

            # ADD RELATIVE COLUMNS
            for abs_column in abs_columns:
                abs_column = abs_column.__copy__()
                abs_column.type = es_type_to_json_type[abs_column.type]
                for query_path in query_paths:
                    add_column(abs_column, query_path)
        pass

    def query(self, _query):
        return self.meta.columns.query(
            QueryOp(
                set_default(
                    {
                        "from": self.meta.columns,
                        "sort": ["table", "name"]
                    }, _query.__data__())))

    def get_columns(self, table_name, column_name=None, force=False):
        """
        RETURN METADATA COLUMNS
        """
        table_path = split_field(table_name)
        es_index_name = table_path[0]
        query_path = join_field(table_path[1:])
        table = self.get_table(es_index_name)[0]
        abs_column_name = None if column_name == None else concat_field(
            query_path, column_name)

        try:
            # LAST TIME WE GOT INFO FOR THIS TABLE
            if not table:
                table = Table(name=es_index_name,
                              url=None,
                              query_path=['.'],
                              timestamp=Date.now())
                with self.meta.tables.locker:
                    self.meta.tables.add(table)
                self._get_columns(table=es_index_name)
            elif force or table.timestamp == None or table.timestamp < Date.now(
            ) - MAX_COLUMN_METADATA_AGE:
                table.timestamp = Date.now()
                self._get_columns(table=es_index_name)

            with self.meta.columns.locker:
                columns = self.meta.columns.find(es_index_name, column_name)
            if columns:
                columns = jx.sort(columns, "names.\\.")
                # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
                while len(self.todo) and not all(columns.get("last_updated")):
                    if DEBUG:
                        Log.note(
                            "waiting for columns to update {{columns|json}}",
                            columns=[
                                c.es_index + "." + c.es_column for c in columns
                                if not c.last_updated
                            ])
                    Till(seconds=1).wait()
                return columns
        except Exception as e:
            Log.error("Not expected", cause=e)

        if abs_column_name:
            Log.error("no columns matching {{table}}.{{column}}",
                      table=table_name,
                      column=abs_column_name)
        else:
            self._get_columns(table=table_name)  # TO TEST WHAT HAPPENED
            Log.error("no columns for {{table}}?!", table=table_name)

    def _update_cardinality(self, column):
        """
        QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN
        """
        if column.es_index in self.index_does_not_exist:
            return

        if column.type in STRUCT:
            Log.error("not supported")
        try:
            if column.es_index == "meta.columns":
                with self.meta.columns.locker:
                    partitions = jx.sort([
                        g[column.es_column] for g, _ in jx.groupby(
                            self.meta.columns, column.es_column)
                        if g[column.es_column] != None
                    ])
                    self.meta.columns.update({
                        "set": {
                            "partitions": partitions,
                            "count": len(self.meta.columns),
                            "cardinality": len(partitions),
                            "multi": 1,
                            "last_updated": Date.now()
                        },
                        "where": {
                            "eq": {
                                "es_index": column.es_index,
                                "es_column": column.es_column
                            }
                        }
                    })
                return
            if column.es_index == "meta.tables":
                with self.meta.columns.locker:
                    partitions = jx.sort([
                        g[column.es_column] for g, _ in jx.groupby(
                            self.meta.tables, column.es_column)
                        if g[column.es_column] != None
                    ])
                    self.meta.columns.update({
                        "set": {
                            "partitions": partitions,
                            "count": len(self.meta.tables),
                            "cardinality": len(partitions),
                            "multi": 1,
                            "last_updated": Date.now()
                        },
                        "where": {
                            "eq": {
                                "es_index": column.es_index,
                                "es_column": column.es_column
                            }
                        }
                    })
                return

            es_index = column.es_index.split(".")[0]

            is_text = [
                cc for cc in self.abs_columns
                if cc.es_column == column.es_column and cc.type == "text"
            ]
            if is_text:
                # text IS A MULTIVALUE STRING THAT CAN ONLY BE FILTERED
                result = self.default_es.post("/" + es_index + "/_search",
                                              data={
                                                  "aggs": {
                                                      "count": {
                                                          "filter": {
                                                              "match_all": {}
                                                          }
                                                      }
                                                  },
                                                  "size": 0
                                              })
                count = result.hits.total
                cardinality = 1001
                multi = 1001
            elif column.es_column == "_id":
                result = self.default_es.post("/" + es_index + "/_search",
                                              data={
                                                  "query": {
                                                      "match_all": {}
                                                  },
                                                  "size": 0
                                              })
                count = cardinality = result.hits.total
                multi = 1
            else:
                result = self.default_es.post(
                    "/" + es_index + "/_search",
                    data={
                        "aggs": {
                            "count": _counting_query(column),
                            "multi": {
                                "max": {
                                    "script":
                                    "doc[" + quote(column.es_column) +
                                    "].values.size()"
                                }
                            }
                        },
                        "size": 0
                    })
                r = result.aggregations.count
                count = result.hits.total
                cardinality = coalesce(r.value, r._nested.value, r.doc_count)
                multi = coalesce(r.multi.value, 1)
                if cardinality == None:
                    Log.error("logic error")

            query = Data(size=0)

            if column.es_column == "_id":
                with self.meta.columns.locker:
                    self.meta.columns.update({
                        "set": {
                            "count": cardinality,
                            "cardinality": cardinality,
                            "multi": 1,
                            "last_updated": Date.now()
                        },
                        "clear": ["partitions"],
                        "where": {
                            "eq": {
                                "es_index": column.es_index,
                                "es_column": column.es_column
                            }
                        }
                    })
                return
            elif cardinality > 1000 or (count >= 30 and cardinality == count
                                        ) or (count >= 1000
                                              and cardinality / count > 0.99):
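                # HIGH OR NEAR-UNIQUE CARDINALITY: RECORD THE COUNTS, BUT DO NOT STORE PARTITIONS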
                if DEBUG:
                    Log.note("{{table}}.{{field}} has {{num}} parts",
                             table=column.es_index,
                             field=column.es_column,
                             num=cardinality)
                with self.meta.columns.locker:
                    self.meta.columns.update({
                        "set": {
                            "count": count,
                            "cardinality": cardinality,
                            "multi": multi,
                            "last_updated": Date.now()
                        },
                        "clear": ["partitions"],
                        "where": {
                            "eq": {
                                "es_index": column.es_index,
                                "es_column": column.es_column
                            }
                        }
                    })
                return
            elif column.type in elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
                if DEBUG:
                    Log.note("{{field}} has {{num}} parts",
                             field=column.es_index,
                             num=cardinality)
                with self.meta.columns.locker:
                    self.meta.columns.update({
                        "set": {
                            "count": count,
                            "cardinality": cardinality,
                            "multi": multi,
                            "last_updated": Date.now()
                        },
                        "clear": ["partitions"],
                        "where": {
                            "eq": {
                                "es_index": column.es_index,
                                "es_column": column.es_column
                            }
                        }
                    })
                return
            elif len(column.nested_path) != 1:
                query.aggs["_"] = {
                    "nested": {
                        "path": column.nested_path[0]
                    },
                    "aggs": {
                        "_nested": {
                            "terms": {
                                "field": column.es_column
                            }
                        }
                    }
                }
            elif cardinality == 0:
                query.aggs["_"] = {"terms": {"field": column.es_column}}
            else:
                query.aggs["_"] = {
                    "terms": {
                        "field": column.es_column,
                        "size": cardinality
                    }
                }

            result = self.default_es.post("/" + es_index + "/_search",
                                          data=query)

            aggs = result.aggregations._
            if aggs._nested:
                parts = jx.sort(aggs._nested.buckets.key)
            else:
                parts = jx.sort(aggs.buckets.key)

            if DEBUG:
                Log.note("{{field}} has {{parts}}",
                         field=column.names["."],
                         parts=parts)
            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "partitions": parts,
                        "last_updated": Date.now()
                    },
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
        except Exception as e:
            # CAN NOT IMPORT: THE TEST MODULE SETS UP LOGGING
            # from tests.test_jx import TEST_TABLE
            TEST_TABLE = "testdata"
            is_missing_index = any(
                w in e for w in
                ["IndexMissingException", "index_not_found_exception"])
            is_test_table = any(
                column.es_index.startswith(t)
                for t in [TEST_TABLE_PREFIX, TEST_TABLE])
            if is_missing_index and is_test_table:
                # WE EXPECT TEST TABLES TO DISAPPEAR
                with self.meta.columns.locker:
                    self.meta.columns.update({
                        "clear": ".",
                        "where": {
                            "eq": {
                                "es_index": column.es_index
                            }
                        }
                    })
                self.index_does_not_exist.add(column.es_index)
            else:
                self.meta.columns.update({
                    "set": {
                        "last_updated": Date.now()
                    },
                    "clear": [
                        "count",
                        "cardinality",
                        "multi",
                        "partitions",
                    ],
                    "where": {
                        "eq": {
                            "names.\\.": ".",
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
                Log.warning(
                    "Could not get {{col.es_index}}.{{col.es_column}} info",
                    col=column,
                    cause=e)

    def monitor(self, please_stop):
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            try:
                if not self.todo:
                    with self.meta.columns.locker:
                        old_columns = [
                            c for c in self.meta.columns
                            if (c.last_updated == None or c.last_updated <
                                Date.now() - TOO_OLD) and c.type not in STRUCT
                        ]
                        if old_columns:
                            if DEBUG:
                                Log.note(
                                    "Old columns {{names|json}} last updated {{dates|json}}",
                                    names=wrap(old_columns).es_column,
                                    dates=[
                                        Date(t).format()
                                        for t in wrap(old_columns).last_updated
                                    ])
                            self.todo.extend(old_columns)
                            # TEST CONSISTENCY
                            for c, d in product(list(self.todo.queue),
                                                list(self.todo.queue)):
                                if c.es_column == d.es_column and c.es_index == d.es_index and c != d:
                                    Log.error("duplicate column in metadata todo queue")
                        else:
                            if DEBUG:
                                Log.note("no more metadata to update")

                column = self.todo.pop(Till(seconds=(10 * MINUTE).seconds))
                if DEBUG:
                    Log.note("update {{table}}.{{column}}",
                             table=column.es_index,
                             column=column.es_column)
                if column:
                    if column.es_index in self.index_does_not_exist:
                        with self.meta.columns.locker:
                            self.meta.columns.update({
                                "clear": ".",
                                "where": {
                                    "eq": {
                                        "es_index": column.es_index
                                    }
                                }
                            })
                        continue
                    if column.type in STRUCT or column.es_column.endswith(
                            "." + EXISTS_TYPE):
                        with self.meta.columns.locker:
                            column.last_updated = Date.now()
                        continue
                    elif column.last_updated >= Date.now() - TOO_OLD:
                        continue
                    try:
                        self._update_cardinality(column)
                        if DEBUG and not column.es_index.startswith(
                                TEST_TABLE_PREFIX):
                            Log.note("updated {{column.name}}", column=column)
                    except Exception as e:
                        Log.warning(
                            "problem getting cardinality for {{column.name}}",
                            column=column,
                            cause=e)
            except Exception as e:
                Log.warning("problem in cardinality monitor", cause=e)

    def not_monitor(self, please_stop):
        Log.alert("metadata scan has been disabled")
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            c = self.todo.pop()
            if c == THREAD_STOP:
                break

            if not c.last_updated or c.last_updated >= Date.now() - TOO_OLD:
                continue

            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "last_updated": Date.now()
                    },
                    "clear": [
                        "count",
                        "cardinality",
                        "multi",
                        "partitions",
                    ],
                    "where": {
                        "eq": {
                            "es_index": c.es_index,
                            "es_column": c.es_column
                        }
                    }
                })
            if DEBUG:
                Log.note(
                    "Could not get {{col.es_index}}.{{col.es_column}} info",
                    col=c)
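
FromESMetadata overrides __new__ so the whole process shares a single instance, and guards __init__ with hasattr(self, "settings") so re-construction is a no-op. A generic standalone sketch of that singleton pattern (the class name Config and its name attribute are illustrative):

class Config(object):
    _instance = None  # THE SHARED, PROCESS-WIDE INSTANCE

    def __new__(cls, *args, **kwargs):
        if cls._instance is None:
            cls._instance = object.__new__(cls)
        return cls._instance

    def __init__(self, name=None):
        if hasattr(self, "name"):
            return  # ALREADY INITIALIZED; SAME ROLE AS THE hasattr(self, "settings") GUARD
        self.name = name


assert Config("first") is Config("second")
assert Config("second").name == "first"
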
Example #19
class ElasticsearchMetadata(Namespace):
    """
    MANAGE SNOWFLAKE SCHEMAS FOR EACH OF THE ALIASES FOUND IN THE CLUSTER
    """
    @override
    def __new__(cls, kwargs, *args, **_kwargs):
        es_cluster = elasticsearch.Cluster(kwargs)
        output = known_clusters.get(id(es_cluster))
        if output is None:
            output = object.__new__(cls)
            known_clusters[id(es_cluster)] = output
        return output

    @override
    def __init__(self,
                 host,
                 index,
                 sql_file='metadata.sqlite',
                 alias=None,
                 name=None,
                 port=9200,
                 kwargs=None):
        if hasattr(self, "settings"):
            return

        self.too_old = TOO_OLD
        self.settings = kwargs
        self.default_name = coalesce(name, alias, index)
        self.es_cluster = elasticsearch.Cluster(kwargs=kwargs)

        self.index_does_not_exist = set()
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        self.index_to_alias = {}

        self.es_metadata = Null
        self.metadata_last_updated = Date.now() - OLD_METADATA

        self.meta = Data()
        self.meta.columns = ColumnList(URL(self.es_cluster.settings.host).host)

        self.alias_to_query_paths = {
            "meta.columns": [ROOT_PATH],
            "meta.tables": [ROOT_PATH]
        }
        self.alias_last_updated = {
            "meta.columns": Date.now(),
            "meta.tables": Date.now()
        }
        table_columns = metadata_tables()
        self.meta.tables = ListContainer("meta.tables", [],
                                         jx_base.Schema(".", table_columns))
        self.meta.columns.extend(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("not refresh metadata", self.not_monitor)
        return

    @property
    def namespace(self):
        return self.meta.columns.namespace

    @property
    def url(self):
        return self.es_cluster.url / self.default_name.replace(".", "/")

    def _reload_columns(self, table_desc):
        """
        :param table_desc: TABLE DESCRIPTION; ITS name IS A REAL ALIAS (OR NAME OF AN INDEX THAT HAS NO ALIAS)
        :return: LIST OF CANONICAL COLUMNS FOR THE ALIAS
        """
        # FIND ALL INDEXES OF ALIAS
        es_last_updated = self.es_cluster.metatdata_last_updated

        alias = table_desc.name
        canonical_index = self.es_cluster.get_best_matching_index(alias).index
        es_metadata_update_required = not (table_desc.timestamp <
                                           es_last_updated)
        metadata = self.es_cluster.get_metadata(
            force=es_metadata_update_required)

        props = [(self.es_cluster.get_index(index=i, type=t,
                                            debug=DEBUG), t, m.properties)
                 for i, d in metadata.indices.items() if alias in d.aliases
                 for t, m in [_get_best_type_from_mapping(d.mappings)]]

        # CONFIRM ALL COLUMNS ARE SAME, FIX IF NOT
        dirty = False
        all_comparisons = list(jx.pairwise(props)) + list(
            jx.pairwise(jx.reverse(props)))
        # NOTICE THE SAME (index, type, properties) TRIPLE FROM ABOVE
        for (i1, t1, p1), (i2, t2, p2) in all_comparisons:
            diff = elasticsearch.diff_schema(p2, p1)
            if not self.settings.read_only:
                for d in diff:
                    dirty = True
                    i1.add_property(*d)
        meta = self.es_cluster.get_metadata(
            force=dirty).indices[canonical_index]

        data_type, mapping = _get_best_type_from_mapping(meta.mappings)
        mapping.properties["_id"] = {"type": "string", "index": "not_analyzed"}
        columns = self._parse_properties(alias, mapping)
        table_desc.timestamp = es_last_updated
        return columns

    def _parse_properties(self, alias, mapping):
        abs_columns = elasticsearch.parse_properties(alias, ".", ROOT_PATH,
                                                     mapping.properties)
        if DEBUG and any(c.cardinality == 0 and c.name != '_id'
                         for c in abs_columns):
            Log.warning(
                "Some columns are not stored in {{url}} {{index|quote}} table:\n{{names}}",
                url=self.es_cluster.url,
                index=alias,
                names=[
                    ".".join((c.es_index, c.name)) for c in abs_columns
                    if c.cardinality == 0
                ])

        with Timer("upserting {{num}} columns", {"num": len(abs_columns)},
                   silent=not DEBUG):
            # LIST OF EVERY NESTED PATH
            query_paths = [[c.es_column] for c in abs_columns
                           if c.es_type == "nested"]
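            # FOR EACH NESTED PATH, INSERT ANY SHORTER NESTED PATH THAT IS A PREFIX, SO EVERY PATH LISTS ITS ANCESTORS (DEEPEST FIRST)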
            for a, b in itertools.product(query_paths, query_paths):
                aa = a[0]
                bb = b[0]
                if aa and bb.startswith(aa):
                    for i, b_prefix in enumerate(b):
                        if len(b_prefix) > len(aa):
                            continue
                        if aa == b_prefix:
                            break  # SPLIT ALREADY FOUND
                        b.insert(i, aa)
                        break
            for q in query_paths:
                q.append(".")
            query_paths.append(ROOT_PATH)

            # ENSURE ALL TABLES HAVE THE QUERY PATHS SET
            self.alias_to_query_paths[alias] = query_paths
            for i, a in self.index_to_alias.items():
                if a == alias:
                    self.alias_to_query_paths[i] = query_paths

            # ENSURE COLUMN HAS CORRECT jx_type
            # PICK DEEPEST NESTED PROPERTY AS REPRESENTATIVE
            output = []
            best = {}
            for abs_column in abs_columns:
                abs_column.jx_type = jx_type(abs_column)
                if abs_column.jx_type not in STRUCT:
                    clean_name = unnest_path(abs_column.name)
                    other = best.get(clean_name)
                    if other:
                        if len(other.nested_path) < len(
                                abs_column.nested_path):
                            output.remove(other)
                            self.meta.columns.update({
                                "clear": ".",
                                "where": {
                                    "eq": {
                                        "es_column": other.es_column,
                                        "es_index": other.es_index
                                    }
                                }
                            })
                        else:
                            continue
                    best[clean_name] = abs_column
                output.append(abs_column)

            # REGISTER ALL COLUMNS
            canonicals = []
            for abs_column in output:
                canonical = self.meta.columns.add(abs_column)
                canonicals.append(canonical)

            self.todo.extend(canonicals)
            return canonicals

    def query(self, _query):
        return self.meta.columns.query(
            QueryOp(
                set_default(
                    {
                        "from": self.meta.columns,
                        "sort": ["table", "name"]
                    }, _query.__data__())))

    def _find_alias(self, name):
        if self.metadata_last_updated < self.es_cluster.metatdata_last_updated:
            for a in self.es_cluster.get_aliases():
                self.index_to_alias[a.index] = coalesce(a.alias, a.index)
                self.alias_last_updated.setdefault(a.alias, Date.MIN)
        if name in self.alias_last_updated:
            return name
        else:
            return self.index_to_alias.get(name)

    def get_columns(self,
                    table_name,
                    column_name=None,
                    after=None,
                    timeout=None):
        """
        RETURN METADATA COLUMNS

        :param table_name: TABLE WE WANT COLUMNS FOR
        :param column_name:  OPTIONAL NAME, IF INTERESTED IN ONLY ONE COLUMN
        :param after: FORCE LOAD, WAITING FOR last_updated TO BE AFTER THIS TIME
        :param timeout: Signal; True when should give up
        :return:
        """
        DEBUG and after and Log.note("getting columns for after {{time}}",
                                     time=after)
        table_path = split_field(table_name)
        root_table_name = table_path[0]

        alias = self._find_alias(root_table_name)
        if not alias:
            self.es_cluster.get_metadata(force=True)
            alias = self._find_alias(root_table_name)
            if not alias:
                Log.error("{{table|quote}} does not exist", table=table_name)

        try:
            table = self.get_table(alias)[0]
            # LAST TIME WE GOT INFO FOR THIS TABLE
            if not table:
                table = TableDesc(name=alias,
                                  url=None,
                                  query_path=["."],
                                  timestamp=Date.MIN)
                with self.meta.tables.locker:
                    self.meta.tables.add(table)
                columns = self._reload_columns(table)
                DEBUG and Log.note("columns from reload")
            elif after or table.timestamp < self.es_cluster.metatdata_last_updated:
                columns = self._reload_columns(table)
                DEBUG and Log.note("columns from reload")
            else:
                columns = self.meta.columns.find(alias, column_name)
                DEBUG and Log.note("columns from find()")

            DEBUG and Log.note("columns are {{ids}}",
                               ids=[id(c) for c in columns])

            columns = jx.sort(columns, "name")

            if after is None:
                return columns  # DO NOT WAIT FOR COMPLETE COLUMNS

            # WAIT FOR THE COLUMNS TO UPDATE
            while True:
                pending = [
                    c for c in columns if after >= c.last_updated or (
                        c.cardinality == None and c.jx_type not in STRUCT)
                ]
                if not pending:
                    break
                if timeout:
                    Log.error("timed out waiting for columns to update")
                if DEBUG:
                    if len(pending) > 10:
                        Log.note(
                            "waiting for {{num}} columns to update by {{timestamp}}",
                            num=len(pending),
                            timestamp=after)
                    else:
                        Log.note(
                            "waiting for columns to update by {{timestamp}}; {{columns|json}}",
                            timestamp=after,
                            columns=[
                                c.es_index + "." + c.es_column + " id=" +
                                text_type(id(c)) for c in pending
                            ])
                Till(seconds=1).wait()
            return columns
        except Exception as e:
            Log.error("Failure to get columns for {{table}}",
                      table=table_name,
                      cause=e)

        return []

    def _update_cardinality(self, column):
        """
        QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN
        """
        now = Date.now()
        if column.es_index in self.index_does_not_exist:
            return

        if column.jx_type in STRUCT:
            Log.error("not supported")
        try:
            if column.es_index == "meta.columns":
                partitions = jx.sort([
                    g[column.es_column]
                    for g, _ in jx.groupby(self.meta.columns, column.es_column)
                    if g[column.es_column] != None
                ])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.columns),
                        "cardinality": len(partitions),
                        "multi": 1,
                        "last_updated": now
                    },
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
                return
            if column.es_index == "meta.tables":
                partitions = jx.sort([
                    g[column.es_column]
                    for g, _ in jx.groupby(self.meta.tables, column.es_column)
                    if g[column.es_column] != None
                ])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.tables),
                        "cardinality": len(partitions),
                        "multi": 1,
                        "last_updated": now
                    },
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
                return

            es_index = column.es_index.split(".")[0]

            is_text = [
                cc for cc in self.meta.columns
                if cc.es_column == column.es_column and cc.es_type == "text"
            ]
            if is_text:
                # text IS A MULTIVALUE STRING THAT CAN ONLY BE FILTERED
                result = self.es_cluster.post("/" + es_index + "/_search",
                                              data={
                                                  "aggs": {
                                                      "count": {
                                                          "filter": {
                                                              "match_all": {}
                                                          }
                                                      }
                                                  },
                                                  "size": 0
                                              })
                count = result.hits.total
                cardinality = max(1001, count)
                multi = 1001
            elif column.es_column == "_id":
                result = self.es_cluster.post("/" + es_index + "/_search",
                                              data={
                                                  "query": {
                                                      "match_all": {}
                                                  },
                                                  "size": 0
                                              })
                count = cardinality = result.hits.total
                multi = 1
            elif column.es_type == BOOLEAN:
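                # BOOLEAN COLUMNS ALWAYS HAVE THE SAME TWO PARTITIONS; ONLY THE DOCUMENT COUNT IS QUERIED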
                result = self.es_cluster.post("/" + es_index + "/_search",
                                              data={
                                                  "aggs": {
                                                      "count":
                                                      _counting_query(column)
                                                  },
                                                  "size": 0
                                              })
                count = result.hits.total
                cardinality = 2

                DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts",
                                   table=column.es_index,
                                   field=column.es_column,
                                   num=cardinality)
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "partitions": [False, True],
                        "multi": 1,
                        "last_updated": now
                    },
                    "clear": ["partitions"],
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
                return
            else:
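                # COUNT DISTINCT VALUES; THE multi AGG (MAX VALUES PER DOCUMENT) IS LIMITED TO DOCS FROM ROUGHLY THE LAST WEEK, OR DOCS WITHOUT etl.timestamp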
                es_query = {
                    "aggs": {
                        "count": _counting_query(column),
                        "_filter": {
                            "aggs": {
                                "multi": {
                                    "max": {
                                        "script":
                                        "doc[" + quote(column.es_column) +
                                        "].values.size()"
                                    }
                                }
                            },
                            "filter": {
                                "bool": {
                                    "should": [{
                                        "range": {
                                            "etl.timestamp.~n~": {
                                                "gte": (Date.today() - WEEK)
                                            }
                                        }
                                    }, {
                                        "bool": {
                                            "must_not": {
                                                "exists": {
                                                    "field":
                                                    "etl.timestamp.~n~"
                                                }
                                            }
                                        }
                                    }]
                                }
                            }
                        }
                    },
                    "size": 0
                }

                result = self.es_cluster.post("/" + es_index + "/_search",
                                              data=es_query)
                agg_results = result.aggregations
                count = result.hits.total
                cardinality = coalesce(agg_results.count.value,
                                       agg_results.count._nested.value,
                                       agg_results.count.doc_count)
                multi = int(coalesce(agg_results._filter.multi.value, 1))
                if cardinality == None:
                    Log.error("logic error")

            query = Data(size=0)

            if column.es_column == "_id":
                self.meta.columns.update({
                    "set": {
                        "count": cardinality,
                        "cardinality": cardinality,
                        "multi": 1,
                        "last_updated": now
                    },
                    "clear": ["partitions"],
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
                return
            elif cardinality > 1000 or (count >= 30 and cardinality == count
                                        ) or (count >= 1000
                                              and cardinality / count > 0.99):
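                # HIGH OR NEAR-UNIQUE CARDINALITY: RECORD THE COUNTS, BUT DO NOT STORE PARTITIONS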
                DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts",
                                   table=column.es_index,
                                   field=column.es_column,
                                   num=cardinality)
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "last_updated": now
                    },
                    "clear": ["partitions"],
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
                return
            elif column.es_type in elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
                DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts",
                                   table=column.es_index,
                                   field=column.es_column,
                                   num=cardinality)
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "last_updated": now
                    },
                    "clear": ["partitions"],
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
                return
            elif len(column.nested_path) != 1:
                query.aggs["_"] = {
                    "nested": {
                        "path": column.nested_path[0]
                    },
                    "aggs": {
                        "_nested": {
                            "terms": {
                                "field": column.es_column
                            }
                        }
                    }
                }
            elif cardinality == 0:  # WHEN DOES THIS HAPPEN?
                query.aggs["_"] = {"terms": {"field": column.es_column}}
            else:
                query.aggs["_"] = {
                    "terms": {
                        "field": column.es_column,
                        "size": cardinality
                    }
                }

            result = self.es_cluster.post("/" + es_index + "/_search",
                                          data=query)

            aggs = result.aggregations._
            if aggs._nested:
                parts = jx.sort(aggs._nested.buckets.key)
            else:
                parts = jx.sort(aggs.buckets.key)

            DEBUG and Log.note(
                "update metadata for {{column.es_index}}.{{column.es_column}} (id={{id}}) at {{time}}",
                id=id(column),
                column=column,
                time=now)
            self.meta.columns.update({
                "set": {
                    "count": count,
                    "cardinality": cardinality,
                    "multi": multi,
                    "partitions": parts,
                    "last_updated": now
                },
                "where": {
                    "eq": {
                        "es_index": column.es_index,
                        "es_column": column.es_column
                    }
                }
            })
        except Exception as e:
            # CAN NOT IMPORT: THE TEST MODULE SETS UP LOGGING
            # from tests.test_jx import TEST_TABLE
            e = Except.wrap(e)
            TEST_TABLE = "testdata"
            is_missing_index = any(
                w in e for w in
                ["IndexMissingException", "index_not_found_exception"])
            is_test_table = column.es_index.startswith(
                (TEST_TABLE_PREFIX, TEST_TABLE))
            if is_missing_index:
                # WE EXPECT TEST TABLES TO DISAPPEAR
                Log.warning("Missing index {{col.es_index}}",
                            col=column,
                            cause=e)
                self.meta.columns.update({
                    "clear": ".",
                    "where": {
                        "eq": {
                            "es_index": column.es_index
                        }
                    }
                })
                self.index_does_not_exist.add(column.es_index)
            elif "No field found for" in e:
                self.meta.columns.update({
                    "clear": ".",
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
                Log.warning(
                    "Could not get column {{col.es_index}}.{{col.es_column}} info",
                    col=column,
                    cause=e)
            else:
                self.meta.columns.update({
                    "set": {
                        "last_updated": now
                    },
                    "clear": [
                        "count",
                        "cardinality",
                        "multi",
                        "partitions",
                    ],
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
                Log.warning(
                    "Could not get {{col.es_index}}.{{col.es_column}} info",
                    col=column,
                    cause=e)

    def monitor(self, please_stop):
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            try:
                if not self.todo:
                    old_columns = [
                        c for c in self.meta.columns
                        if ((c.last_updated < Date.now() -
                             MAX_COLUMN_METADATA_AGE) or c.cardinality == None)
                        and c.jx_type not in STRUCT
                    ]
                    if old_columns:
                        DEBUG and Log.note(
                            "Old columns {{names|json}} last updated {{dates|json}}",
                            names=wrap(old_columns).es_column,
                            dates=[
                                Date(t).format()
                                for t in wrap(old_columns).last_updated
                            ])
                        self.todo.extend(old_columns)
                    else:
                        DEBUG and Log.note("no more metadata to update")

                column = self.todo.pop(Till(seconds=(10 * MINUTE).seconds))
                if column:
                    if column is THREAD_STOP:
                        continue

                    with Timer("update {{table}}.{{column}}",
                               param={
                                   "table": column.es_index,
                                   "column": column.es_column
                               },
                               silent=not DEBUG):
                        if column.es_index in self.index_does_not_exist:
                            DEBUG and Log.note(
                                "{{column.es_column}} does not exist",
                                column=column)
                            self.meta.columns.update({
                                "clear": ".",
                                "where": {
                                    "eq": {
                                        "es_index": column.es_index
                                    }
                                }
                            })
                            continue
                        if column.jx_type in STRUCT or split_field(
                                column.es_column)[-1] == EXISTS_TYPE:
                            DEBUG and Log.note(
                                "{{column.es_column}} is a struct",
                                column=column)
                            column.last_updated = Date.now()
                            continue
                        elif column.last_updated > Date.now(
                        ) - TOO_OLD and column.cardinality is not None:
                            # DO NOT UPDATE FRESH COLUMN METADATA
                            DEBUG and Log.note(
                                "{{column.es_column}} is still fresh ({{ago}} ago)",
                                column=column,
                                ago=(Date.now() -
                                     Date(column.last_updated)).seconds)
                            continue
                        try:
                            self._update_cardinality(column)
                            (DEBUG and
                             not column.es_index.startswith(TEST_TABLE_PREFIX)
                             ) and Log.note("updated {{column.name}}",
                                            column=column)
                        except Exception as e:
                            if '"status":404' in e:
                                self.meta.columns.update({
                                    "clear": ".",
                                    "where": {
                                        "eq": {
                                            "es_index": column.es_index,
                                            "es_column": column.es_column
                                        }
                                    }
                                })
                            else:
                                Log.warning(
                                    "problem getting cardinality for {{column.name}}",
                                    column=column,
                                    cause=e)
            except Exception as e:
                Log.warning("problem in cardinality monitor", cause=e)

    def not_monitor(self, please_stop):
        Log.alert("metadata scan has been disabled")
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            column = self.todo.pop()
            if column == THREAD_STOP:
                break

            if column.jx_type in STRUCT or split_field(
                    column.es_column)[-1] == EXISTS_TYPE:
                DEBUG and Log.note("{{column.es_column}} is a struct",
                                   column=column)
                column.last_updated = Date.now()
                continue
            elif column.last_updated > Date.now(
            ) - TOO_OLD and column.cardinality is not None:
                # DO NOT UPDATE FRESH COLUMN METADATA
                DEBUG and Log.note(
                    "{{column.es_column}} is still fresh ({{ago}} ago)",
                    column=column,
                    ago=(Date.now() - Date(column.last_updated)).seconds)
                continue

            with Timer("Update {{col.es_index}}.{{col.es_column}}",
                       param={"col": column},
                       silent=not DEBUG,
                       too_long=0.05):
                if untype_path(column.name) in ["build.type", "run.type"]:
                    try:
                        self._update_cardinality(column)
                    except Exception as e:
                        Log.warning(
                            "problem getting cardinality for {{column.name}}",
                            column=column,
                            cause=e)
                else:
                    column.last_updated = Date.now()

    def get_table(self, name):
        if name == "meta.columns":
            return self.meta.columns

        with self.meta.tables.locker:
            return wrap([t for t in self.meta.tables.data if t.name == name])

    def get_snowflake(self, fact_table_name):
        return Snowflake(fact_table_name, self)

    def get_schema(self, name):
        if name == "meta.columns":
            return self.meta.columns.schema
        if name == "meta.tables":
            return self.meta.tables
        root, rest = tail_field(name)
        return self.get_snowflake(root).get_schema(rest)
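
For context, a hypothetical way ElasticsearchMetadata might be wired up and queried (the host and index values are placeholders, and the snippet assumes the same imports as the class above). get_columns returns whatever metadata is already known; when after is given it blocks until every returned column has been refreshed after that time, or raises once the timeout signal fires:

# ILLUSTRATIVE WIRING; host AND index ARE PLACEHOLDERS
metadata = ElasticsearchMetadata(host="http://localhost", index="unittest", port=9200)

# NON-BLOCKING: RETURN THE COLUMN METADATA ALREADY KNOWN FOR THE ALIAS
columns = metadata.get_columns("unittest")

# BLOCKING: WAIT FOR THE MONITOR TO REFRESH EVERY COLUMN AFTER `after`
fresh = metadata.get_columns("unittest", after=Date.now(), timeout=Till(seconds=30))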