Example #1
    def _find_revision(self, revision):
        locker = Lock()
        output = []
        queue = Queue("branches", max=2000)
        queue.extend(b for b in self.branches if b.locale == DEFAULT_LOCALE and b.name in ["try", "mozilla-inbound", "autoland"])
        queue.add(THREAD_STOP)

        problems = []
        def _find(please_stop):
            for b in queue:
                if please_stop:
                    return
                try:
                    url = b.url + "json-info?node=" + revision
                    rev = self.get_revision(Revision(branch=b, changeset={"id": revision}))
                    with locker:
                        output.append(rev)
                    Log.note("Revision found at {{url}}", url=url)
                except Exception as f:
                    problems.append(f)  # COLLECTED, BUT NEVER REPORTED TO THE CALLER

        threads = []
        for i in range(3):
            # Thread.run SUPPLIES ITS OWN please_stop Signal TO THE TARGET
            # (SEE THE loop() WORKERS IN THE LATER EXAMPLES)
            threads.append(Thread.run("find changeset " + text_type(i), _find))

        for t in threads:
            with assert_no_exception:
                t.join()

        return output
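
This example leans on the mo-threads Queue/THREAD_STOP idiom. Below is a minimal standard-library sketch of the same fan-out pattern; everything in it (the names, the re-queued sentinel, the worker count) is illustrative rather than taken from the original source.

import queue
import threading

STOP = object()  # sentinel playing the role of THREAD_STOP

def find_all(branches, probe, n_workers=3):
    # FILL THE WORK QUEUE, THEN TERMINATE IT WITH ONE SENTINEL
    work = queue.Queue()
    for b in branches:
        work.put(b)
    work.put(STOP)

    output, problems = [], []
    lock = threading.Lock()

    def worker():
        while True:
            item = work.get()
            if item is STOP:
                work.put(STOP)  # re-queue so the remaining workers stop too
                return
            try:
                rev = probe(item)
                with lock:  # output IS SHARED, SO APPEND UNDER A LOCK
                    output.append(rev)
            except Exception as e:
                problems.append(e)

    threads = [threading.Thread(target=worker) for _ in range(n_workers)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    return output, problems

Re-queuing the sentinel lets a single STOP marker terminate every worker, the same job THREAD_STOP does for the three threads above.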
Example #2
def update_local_database(config, deviant_summary, candidates, since):
    if isinstance(deviant_summary, bigquery.Table):
        Log.note("Only the ETL process should fill the bigquery table")
        return

    # GET EVERYTHING WE HAVE SO FAR
    exists = deviant_summary.query({
        "select": ["signature_hash", "last_updated"],
        "where": {
            "and": [
                {
                    "in": {
                        "signature_hash": candidates.signature_hash
                    }
                },
                {
                    "exists": "num_pushes"
                },
            ]
        },
        "sort": "last_updated",
        "limit": 100000,
        "format": "list",
    }).data
    # CHOOSE MISSING, THEN OLDEST, UP TO "RECENT"
    missing = list(set(candidates.signature_hash) - set(exists.signature_hash))

    too_old = Date.today() - parse(LOCAL_RETENTION)
    needs_update = missing + [
        e.signature_hash for e in exists if e.last_updated < too_old.unix
    ]
    Log.alert("{{num}} series are candidates for local update",
              num=len(needs_update))

    limited_update = Queue("sigs")
    limited_update.extend(
        left(needs_update, coalesce(config.display.download_limit, 100)))
    Log.alert("Updating local database with {{num}} series",
              num=len(limited_update))

    with Timer("Updating local database"):

        def loop(please_stop):
            while not please_stop:
                signature_hash = limited_update.pop_one()
                if not signature_hash:
                    return
                process(
                    signature_hash,
                    since,
                    source=config.database,
                    deviant_summary=deviant_summary,
                )

        threads = [Thread.run(text(i), loop) for i in range(3)]
        for t in threads:
            t.join()

    Log.note("Local database is up to date")
Example #3
def update_local_database():
    # RELIES ON MODULE-LEVEL summary_table, candidates, AND config
    # GET EVERYTHING WE HAVE SO FAR
    exists = summary_table.query({
        "select": ["id", "last_updated"],
        "where": {
            "and": [{
                "in": {
                    "id": candidates.id
                }
            }, {
                "exists": "num_pushes"
            }]
        },
        "sort": "last_updated",
        "limit": 100000,
        "format": "list",
    }).data
    # CHOOSE MISSING, THEN OLDEST, UP TO "RECENT"
    missing = list(set(candidates.id) - set(exists.id))

    too_old = Date.today() - parse(LOCAL_RETENTION)
    needs_update = missing + [
        e.id for e in exists if e.last_updated < too_old.unix
    ]
    Log.alert("{{num}} series are candidates for local update",
              num=len(needs_update))

    limited_update = Queue("sigs")
    limited_update.extend(
        left(needs_update, coalesce(config.analysis.download_limit, 100)))
    Log.alert("Updating local database with {{num}} series",
              num=len(limited_update))

    with Timer("Updating local database"):

        def loop(please_stop):
            while not please_stop:
                sig_id = limited_update.pop_one()
                if not sig_id:
                    return
                process(sig_id)

        threads = [Thread.run(text(i), loop) for i in range(3)]
        for t in threads:
            t.join()

    Log.note("Local database is up to date")
Example #4
class FromESMetadata(Schema):
    """
    QUERY THE METADATA
    """
    def __new__(cls, *args, **kwargs):
        if jx_base_meta.singlton:
            return jx_base_meta.singlton
        else:
            jx_base_meta.singlton = object.__new__(cls)
            return jx_base_meta.singlton

    @override
    def __init__(self,
                 host,
                 index,
                 sql_file='metadata.sqlite',
                 alias=None,
                 name=None,
                 port=9200,
                 kwargs=None):
        if hasattr(self, "settings"):
            return

        self.too_old = TOO_OLD
        self.settings = kwargs
        self.default_name = coalesce(name, alias, index)
        self.default_es = elasticsearch.Cluster(kwargs=kwargs)
        self.index_does_not_exist = set()
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        self.es_metadata = Null
        self.abs_columns = set()
        self.last_es_metadata = Date.now() - OLD_METADATA

        self.meta = Data()
        table_columns = metadata_tables()
        column_columns = metadata_columns()
        self.meta.tables = ListContainer(
            "meta.tables", [], wrap({c.names["."]: c
                                     for c in table_columns}))
        self.meta.columns = ColumnList()
        self.meta.columns.insert(column_columns)
        self.meta.columns.insert(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("refresh metadata", self.not_monitor)
        return

    @property
    def query_path(self):
        return None

    @property
    def url(self):
        return self.default_es.path + "/" + self.default_name.replace(".", "/")

    def get_table(self, table_name):
        with self.meta.tables.locker:
            return wrap(
                [t for t in self.meta.tables.data if t.name == table_name])

    def _upsert_column(self, c):
        # ASSUMES THE CALLER ALREADY HOLDS self.meta.columns.locker
        existing_columns = self.meta.columns.find(c.es_index, c.names["."])
        for canonical in existing_columns:
            if canonical.type == c.type and canonical is not c:
                set_default(c.names, canonical.names)
                for key in Column.__slots__:
                    canonical[key] = c[key]
                if DEBUG:
                    Log.note("todo: {{table}}::{{column}}",
                             table=canonical.es_index,
                             column=canonical.es_column)
                self.todo.add(canonical)
                break
        else:
            self.meta.columns.add(c)
            self.todo.add(c)

            if ENABLE_META_SCAN:
                if DEBUG:
                    Log.note("todo: {{table}}::{{column}}",
                             table=c.es_index,
                             column=c.es_column)
                # MARK meta.columns AS DIRTY TOO
                cols = self.meta.columns.find("meta.columns", None)
                for cc in cols:
                    cc.partitions = cc.cardinality = None
                    cc.last_updated = Date.now() - TOO_OLD
                self.todo.extend(cols)

    def _get_columns(self, table=None):
        # TODO: HANDLE MORE THAN ONE ES, MAP TABLE SHORT_NAME TO ES INSTANCE
        table_path = split_field(table)
        es_index = table_path[0]
        meta = self.es_metadata.indices[es_index]
        if not meta or self.last_es_metadata < Date.now() - OLD_METADATA:
            self.es_metadata = self.default_es.get_metadata(force=True)
            meta = self.es_metadata.indices[es_index]

        for data_type, properties in meta.mappings.items():
            if data_type == "_default_":
                continue
            properties.properties["_id"] = {
                "type": "string",
                "index": "not_analyzed"
            }
            self._parse_properties(meta.index, properties, meta)

    def _parse_properties(self, abs_index, properties, meta):
        # IT IS IMPORTANT THAT NESTED PROPERTIES NAME ALL COLUMNS, AND
        # ALL COLUMNS ARE GIVEN NAMES FOR ALL NESTED PROPERTIES
        def add_column(c, query_path):
            c.last_updated = Date.now() - TOO_OLD
            if query_path[0] != ".":
                c.names[query_path[0]] = relative_field(
                    c.names["."], query_path[0])

            with self.meta.columns.locker:
                for alias in meta.aliases:
                    c_ = copy(c)
                    c_.es_index = alias
                    self._upsert_column(c_)
                self._upsert_column(c)

        abs_columns = elasticsearch.parse_properties(abs_index, None,
                                                     properties.properties)
        self.abs_columns.update(abs_columns)
        with Timer("upserting {{num}} columns", {"num": len(abs_columns)},
                   debug=DEBUG):
            # LIST OF EVERY NESTED PATH
            query_paths = [[c.es_column] for c in abs_columns
                           if c.type == "nested"]
            for a, b in itertools.product(query_paths, query_paths):
                aa = a[0]
                bb = b[0]
                if aa and bb.startswith(aa):
                    for i, b_prefix in enumerate(b):
                        if len(b_prefix) > len(aa):
                            continue
                        if aa == b_prefix:
                            break  # SPLIT ALREADY FOUND
                        b.insert(i, aa)
                        break
            for q in query_paths:
                q.append(".")
            query_paths.append(SELF_PATH)

            # ADD RELATIVE COLUMNS
            for abs_column in abs_columns:
                abs_column = abs_column.__copy__()
                abs_column.type = es_type_to_json_type[abs_column.type]
                for query_path in query_paths:
                    add_column(abs_column, query_path)

    def query(self, _query):
        return self.meta.columns.query(
            QueryOp(
                set_default(
                    {
                        "from": self.meta.columns,
                        "sort": ["table", "name"]
                    }, _query.__data__())))

    def get_columns(self, table_name, column_name=None, force=False):
        """
        RETURN METADATA COLUMNS
        """
        table_path = split_field(table_name)
        es_index_name = table_path[0]
        query_path = join_field(table_path[1:])
        table = self.get_table(es_index_name)[0]
        abs_column_name = None if column_name == None else concat_field(
            query_path, column_name)

        try:
            # LAST TIME WE GOT INFO FOR THIS TABLE
            if not table:
                table = Table(name=es_index_name,
                              url=None,
                              query_path=['.'],
                              timestamp=Date.now())
                with self.meta.tables.locker:
                    self.meta.tables.add(table)
                self._get_columns(table=es_index_name)
            elif force or table.timestamp == None or table.timestamp < Date.now(
            ) - MAX_COLUMN_METADATA_AGE:
                table.timestamp = Date.now()
                self._get_columns(table=es_index_name)

            with self.meta.columns.locker:
                columns = self.meta.columns.find(es_index_name, column_name)
            if columns:
                columns = jx.sort(columns, "names.\\.")
                # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
                while len(self.todo) and not all(columns.get("last_updated")):
                    if DEBUG:
                        Log.note(
                            "waiting for columns to update {{columns|json}}",
                            columns=[
                                c.es_index + "." + c.es_column for c in columns
                                if not c.last_updated
                            ])
                    Till(seconds=1).wait()
                return columns
        except Exception as e:
            Log.error("Not expected", cause=e)

        if abs_column_name:
            Log.error("no columns matching {{table}}.{{column}}",
                      table=table_name,
                      column=abs_column_name)
        else:
            self._get_columns(table=table_name)  # TO TEST WHAT HAPPENED
            Log.error("no columns for {{table}}?!", table=table_name)

    def _update_cardinality(self, column):
        """
        QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN
        """
        if column.es_index in self.index_does_not_exist:
            return

        if column.type in STRUCT:
            Log.error("not supported")
        try:
            if column.es_index == "meta.columns":
                with self.meta.columns.locker:
                    partitions = jx.sort([
                        g[column.es_column] for g, _ in jx.groupby(
                            self.meta.columns, column.es_column)
                        if g[column.es_column] != None
                    ])
                    self.meta.columns.update({
                        "set": {
                            "partitions": partitions,
                            "count": len(self.meta.columns),
                            "cardinality": len(partitions),
                            "multi": 1,
                            "last_updated": Date.now()
                        },
                        "where": {
                            "eq": {
                                "es_index": column.es_index,
                                "es_column": column.es_column
                            }
                        }
                    })
                return
            if column.es_index == "meta.tables":
                with self.meta.columns.locker:
                    partitions = jx.sort([
                        g[column.es_column] for g, _ in jx.groupby(
                            self.meta.tables, column.es_column)
                        if g[column.es_column] != None
                    ])
                    self.meta.columns.update({
                        "set": {
                            "partitions": partitions,
                            "count": len(self.meta.tables),
                            "cardinality": len(partitions),
                            "multi": 1,
                            "last_updated": Date.now()
                        },
                        "where": {
                            "eq": {
                                "es_index": column.es_index,
                                "es_column": column.es_column
                            }
                        }
                    })
                return

            es_index = column.es_index.split(".")[0]

            is_text = [
                cc for cc in self.abs_columns
                if cc.es_column == column.es_column and cc.type == "text"
            ]
            if is_text:
                # text IS A MULTIVALUE STRING THAT CAN ONLY BE FILTERED
                result = self.default_es.post("/" + es_index + "/_search",
                                              data={
                                                  "aggs": {
                                                      "count": {
                                                          "filter": {
                                                              "match_all": {}
                                                          }
                                                      }
                                                  },
                                                  "size": 0
                                              })
                count = result.hits.total
                cardinality = 1001
                multi = 1001
            elif column.es_column == "_id":
                result = self.default_es.post("/" + es_index + "/_search",
                                              data={
                                                  "query": {
                                                      "match_all": {}
                                                  },
                                                  "size": 0
                                              })
                count = cardinality = result.hits.total
                multi = 1
            else:
                result = self.default_es.post(
                    "/" + es_index + "/_search",
                    data={
                        "aggs": {
                            "count": _counting_query(column),
                            "multi": {
                                "max": {
                                    "script":
                                    "doc[" + quote(column.es_column) +
                                    "].values.size()"
                                }
                            }
                        },
                        "size": 0
                    })
                r = result.aggregations.count
                count = result.hits.total
                cardinality = coalesce(r.value, r._nested.value, r.doc_count)
                multi = coalesce(r.multi.value, 1)
                if cardinality == None:
                    Log.error("logic error")

            query = Data(size=0)

            if column.es_column == "_id":
                with self.meta.columns.locker:
                    self.meta.columns.update({
                        "set": {
                            "count": cardinality,
                            "cardinality": cardinality,
                            "multi": 1,
                            "last_updated": Date.now()
                        },
                        "clear": ["partitions"],
                        "where": {
                            "eq": {
                                "es_index": column.es_index,
                                "es_column": column.es_column
                            }
                        }
                    })
                return
            elif cardinality > 1000 or (count >= 30 and cardinality == count
                                        ) or (count >= 1000
                                              and cardinality / count > 0.99):
                if DEBUG:
                    Log.note("{{table}}.{{field}} has {{num}} parts",
                             table=column.es_index,
                             field=column.es_column,
                             num=cardinality)
                with self.meta.columns.locker:
                    self.meta.columns.update({
                        "set": {
                            "count": count,
                            "cardinality": cardinality,
                            "multi": multi,
                            "last_updated": Date.now()
                        },
                        "clear": ["partitions"],
                        "where": {
                            "eq": {
                                "es_index": column.es_index,
                                "es_column": column.es_column
                            }
                        }
                    })
                return
            elif column.type in elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
                if DEBUG:
                    Log.note("{{field}} has {{num}} parts",
                             field=column.es_index,
                             num=cardinality)
                with self.meta.columns.locker:
                    self.meta.columns.update({
                        "set": {
                            "count": count,
                            "cardinality": cardinality,
                            "multi": multi,
                            "last_updated": Date.now()
                        },
                        "clear": ["partitions"],
                        "where": {
                            "eq": {
                                "es_index": column.es_index,
                                "es_column": column.es_column
                            }
                        }
                    })
                return
            elif len(column.nested_path) != 1:
                query.aggs["_"] = {
                    "nested": {
                        "path": column.nested_path[0]
                    },
                    "aggs": {
                        "_nested": {
                            "terms": {
                                "field": column.es_column
                            }
                        }
                    }
                }
            elif cardinality == 0:
                query.aggs["_"] = {"terms": {"field": column.es_column}}
            else:
                query.aggs["_"] = {
                    "terms": {
                        "field": column.es_column,
                        "size": cardinality
                    }
                }

            result = self.default_es.post("/" + es_index + "/_search",
                                          data=query)

            aggs = result.aggregations._
            if aggs._nested:
                parts = jx.sort(aggs._nested.buckets.key)
            else:
                parts = jx.sort(aggs.buckets.key)

            if DEBUG:
                Log.note("{{field}} has {{parts}}",
                         field=column.names["."],
                         parts=parts)
            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "partitions": parts,
                        "last_updated": Date.now()
                    },
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
        except Exception as e:
            # CAN NOT IMPORT: THE TEST MODULE SETS UP LOGGING
            # from tests.test_jx import TEST_TABLE
            TEST_TABLE = "testdata"
            is_missing_index = any(
                w in e for w in
                ["IndexMissingException", "index_not_found_exception"])
            is_test_table = any(
                column.es_index.startswith(t)
                for t in [TEST_TABLE_PREFIX, TEST_TABLE])
            if is_missing_index and is_test_table:
                # WE EXPECT TEST TABLES TO DISAPPEAR
                with self.meta.columns.locker:
                    self.meta.columns.update({
                        "clear": ".",
                        "where": {
                            "eq": {
                                "es_index": column.es_index
                            }
                        }
                    })
                self.index_does_not_exist.add(column.es_index)
            else:
                self.meta.columns.update({
                    "set": {
                        "last_updated": Date.now()
                    },
                    "clear": [
                        "count",
                        "cardinality",
                        "multi",
                        "partitions",
                    ],
                    "where": {
                        "eq": {
                            "names.\\.": ".",
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
                Log.warning(
                    "Could not get {{col.es_index}}.{{col.es_column}} info",
                    col=column,
                    cause=e)

    def monitor(self, please_stop):
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            try:
                if not self.todo:
                    with self.meta.columns.locker:
                        old_columns = [
                            c for c in self.meta.columns
                            if (c.last_updated == None or c.last_updated <
                                Date.now() - TOO_OLD) and c.type not in STRUCT
                        ]
                        if old_columns:
                            if DEBUG:
                                Log.note(
                                    "Old columns {{names|json}} last updated {{dates|json}}",
                                    names=wrap(old_columns).es_column,
                                    dates=[
                                        Date(t).format()
                                        for t in wrap(old_columns).last_updated
                                    ])
                            self.todo.extend(old_columns)
                            # TEST CONSISTENCY
                            for c, d in product(list(self.todo.queue),
                                                list(self.todo.queue)):
                                if c.es_column == d.es_column and c.es_index == d.es_index and c != d:
                                    Log.error("")
                        else:
                            if DEBUG:
                                Log.note("no more metatdata to update")

                column = self.todo.pop(Till(seconds=(10 * MINUTE).seconds))
                if column:
                    if column is THREAD_STOP:
                        continue
                    # LOG ONLY AFTER THE None-ON-TIMEOUT AND SENTINEL CHECKS
                    if DEBUG:
                        Log.note("update {{table}}.{{column}}",
                                 table=column.es_index,
                                 column=column.es_column)
                    if column.es_index in self.index_does_not_exist:
                        with self.meta.columns.locker:
                            self.meta.columns.update({
                                "clear": ".",
                                "where": {
                                    "eq": {
                                        "es_index": column.es_index
                                    }
                                }
                            })
                        continue
                    if column.type in STRUCT or column.es_column.endswith(
                            "." + EXISTS_TYPE):
                        with self.meta.columns.locker:
                            column.last_updated = Date.now()
                        continue
                    elif column.last_updated >= Date.now() - TOO_OLD:
                        continue
                    try:
                        self._update_cardinality(column)
                        if DEBUG and not column.es_index.startswith(
                                TEST_TABLE_PREFIX):
                            Log.note("updated {{column.name}}", column=column)
                    except Exception as e:
                        Log.warning(
                            "problem getting cardinality for {{column.name}}",
                            column=column,
                            cause=e)
            except Exception as e:
                Log.warning("problem in cardinality monitor", cause=e)

    def not_monitor(self, please_stop):
        Log.alert("metadata scan has been disabled")
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            c = self.todo.pop()
            if c == THREAD_STOP:
                break

            if not c.last_updated or c.last_updated >= Date.now() - TOO_OLD:
                continue

            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "last_updated": Date.now()
                    },
                    "clear": [
                        "count",
                        "cardinality",
                        "multi",
                        "partitions",
                    ],
                    "where": {
                        "eq": {
                            "es_index": c.es_index,
                            "es_column": c.es_column
                        }
                    }
                })
            if DEBUG:
                Log.note(
                    "Did not get {{col.es_index}}.{{col.es_column}} info",
                    col=c)
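
Both metadata classes in this listing reuse one instance by overriding __new__. A self-contained sketch of the pattern, with a class attribute standing in for the module-level jx_base_meta.singlton slot:

class Singleton(object):
    _instance = None  # STAND-IN FOR jx_base_meta.singlton

    def __new__(cls, *args, **kwargs):
        if cls._instance is None:
            cls._instance = object.__new__(cls)
        return cls._instance

    def __init__(self, name="default"):
        # __init__ RUNS ON EVERY CONSTRUCTION, SO GUARD RE-INITIALIZATION,
        # JUST AS THE ORIGINAL DOES WITH `if hasattr(self, "settings"): return`
        if hasattr(self, "name"):
            return
        self.name = name

assert Singleton("a") is Singleton("b")
assert Singleton("b").name == "a"  # SECOND CALL DID NOT RE-INITIALIZE

Like the original, this is not thread-safe on first construction.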
Example #5
class ElasticsearchMetadata(Namespace):
    """
    MANAGE SNOWFLAKE SCHEMAS FOR EACH OF THE ALIASES FOUND IN THE CLUSTER
    """
    def __new__(cls, *args, **kwargs):
        if jx_base_meta.singlton:
            return jx_base_meta.singlton
        else:
            jx_base_meta.singlton = object.__new__(cls)
            return jx_base_meta.singlton

    @override
    def __init__(self,
                 host,
                 index,
                 sql_file='metadata.sqlite',
                 alias=None,
                 name=None,
                 port=9200,
                 kwargs=None):
        if hasattr(self, "settings"):
            return

        self.too_old = TOO_OLD
        self.settings = kwargs
        self.default_name = coalesce(name, alias, index)
        self.es_cluster = elasticsearch.Cluster(kwargs=kwargs)
        self.index_does_not_exist = set()
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        self.index_to_alias = Relation_usingList()

        self.es_metadata = Null
        self.metadata_last_updated = Date.now() - OLD_METADATA

        self.meta = Data()
        self.meta.columns = ColumnList()

        self.alias_to_query_paths = {
            "meta.columns": [['.']],
            "meta.tables": [['.']]
        }
        self.alias_last_updated = {
            "meta.columns": Date.now(),
            "meta.tables": Date.now()
        }
        table_columns = metadata_tables()
        self.meta.tables = ListContainer("meta.tables", [],
                                         jx_base.Schema(".", table_columns))
        self.meta.columns.extend(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("refresh metadata", self.not_monitor)
        return

    @property
    def url(self):
        return self.es_cluster.path + "/" + self.default_name.replace(".", "/")

    def _reload_columns(self, table_desc):
        """
        :param table_desc: TABLE DESCRIPTION WHOSE name IS A REAL ALIAS (OR THE NAME OF AN INDEX THAT HAS NO ALIAS)
        :return:
        """
        # FIND ALL INDEXES OF ALIAS
        es_last_updated = self.es_cluster.metatdata_last_updated

        alias = table_desc.name
        canonical_index = self.es_cluster.get_best_matching_index(alias).index
        update_required = not (table_desc.timestamp < es_last_updated)
        metadata = self.es_cluster.get_metadata(force=update_required)

        indexes = self.index_to_alias.get_domain(alias)
        props = [(self.es_cluster.get_index(index=i, type=t,
                                            debug=DEBUG), t, m.properties)
                 for i, d in metadata.indices.items() if i in indexes
                 for t, m in [_get_best_type_from_mapping(d.mappings)]]

        # CONFIRM ALL COLUMNS ARE SAME, FIX IF NOT
        dirty = False
        all_comparisons = list(jx.pairwise(props)) + list(
            jx.pairwise(jx.reverse(props)))
        # NOTICE THE SAME (index, type, properties) TRIPLE FROM ABOVE
        for (i1, t1, p1), (i2, t2, p2) in all_comparisons:
            diff = elasticsearch.diff_schema(p2, p1)
            if not self.settings.read_only:
                for d in diff:
                    dirty = True
                    i1.add_property(*d)
        meta = self.es_cluster.get_metadata(
            force=dirty).indices[canonical_index]

        data_type, mapping = _get_best_type_from_mapping(meta.mappings)
        mapping.properties["_id"] = {"type": "string", "index": "not_analyzed"}
        self._parse_properties(alias, mapping, meta)
        table_desc.timestamp = es_last_updated

    def _parse_properties(self, alias, mapping, meta):
        abs_columns = elasticsearch.parse_properties(alias, None,
                                                     mapping.properties)
        with Timer("upserting {{num}} columns", {"num": len(abs_columns)},
                   debug=DEBUG):
            # LIST OF EVERY NESTED PATH
            query_paths = [[c.es_column] for c in abs_columns
                           if c.es_type == "nested"]
            for a, b in itertools.product(query_paths, query_paths):
                aa = a[0]
                bb = b[0]
                if aa and bb.startswith(aa):
                    for i, b_prefix in enumerate(b):
                        if len(b_prefix) > len(aa):
                            continue
                        if aa == b_prefix:
                            break  # SPLIT ALREADY FOUND
                        b.insert(i, aa)
                        break
            for q in query_paths:
                q.append(SELF_PATH)
            query_paths.append(ROOT_PATH)
            self.alias_to_query_paths[alias] = query_paths

            # ADD RELATIVE NAMES
            for abs_column in abs_columns:
                abs_column.last_updated = None
                abs_column.jx_type = es_type_to_json_type[abs_column.es_type]
                for query_path in query_paths:
                    abs_column.names[query_path[0]] = relative_field(
                        abs_column.names["."], query_path[0])
                self.todo.add(self.meta.columns.add(abs_column))

    def query(self, _query):
        return self.meta.columns.query(
            QueryOp(
                set_default(
                    {
                        "from": self.meta.columns,
                        "sort": ["table", "name"]
                    }, _query.__data__())))

    def _find_alias(self, name):
        if self.metadata_last_updated < self.es_cluster.metatdata_last_updated:
            for a in self.es_cluster.get_aliases():
                self.index_to_alias[a.index] = coalesce(a.alias, a.index)
                self.alias_last_updated.setdefault(a.alias, Date.MIN)
        if name in self.alias_last_updated:
            return name
        else:
            return self.index_to_alias[name]

    def get_columns(self, table_name, column_name=None, force=False):
        """
        RETURN METADATA COLUMNS
        """
        table_path = split_field(table_name)
        root_table_name = table_path[0]

        alias = self._find_alias(root_table_name)
        if not alias:
            self.es_cluster.get_metadata(force=True)
            alias = self._find_alias(root_table_name)
            if not alias:
                Log.error("{{table|quote}} does not exist", table=table_name)

        try:
            last_update = MAX([
                self.es_cluster.index_last_updated[i]
                for i in self.index_to_alias.get_domain(alias)
            ])

            table = self.get_table(alias)[0]
            # LAST TIME WE GOT INFO FOR THIS TABLE
            if not table:
                table = TableDesc(name=alias,
                                  url=None,
                                  query_path=['.'],
                                  timestamp=Date.MIN)
                with self.meta.tables.locker:
                    self.meta.tables.add(table)
                self._reload_columns(table)
            elif force or table.timestamp < last_update:
                self._reload_columns(table)

            columns = self.meta.columns.find(alias, column_name)
            columns = jx.sort(columns, "names.\\.")
            # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
            while len(self.todo) and not all(columns.get("last_updated")):
                if DEBUG:
                    if len(columns) > 10:
                        Log.note("waiting for {{num}} columns to update",
                                 num=len([
                                     c for c in columns if not c.last_updated
                                 ]))
                    else:
                        Log.note(
                            "waiting for columns to update {{columns|json}}",
                            columns=[
                                c.es_index + "." + c.es_column for c in columns
                                if not c.last_updated
                            ])
                Till(seconds=1).wait()
            return columns
        except Exception as e:
            Log.error("Not expected", cause=e)

        return []

    def _update_cardinality(self, column):
        """
        QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN
        """
        if column.es_index in self.index_does_not_exist:
            return

        if column.jx_type in STRUCT:
            Log.error("not supported")
        try:
            if column.es_index == "meta.columns":
                partitions = jx.sort([
                    g[column.es_column]
                    for g, _ in jx.groupby(self.meta.columns, column.es_column)
                    if g[column.es_column] != None
                ])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.columns),
                        "cardinality": len(partitions),
                        "multi": 1,
                        "last_updated": Date.now()
                    },
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
                return
            if column.es_index == "meta.tables":
                partitions = jx.sort([
                    g[column.es_column]
                    for g, _ in jx.groupby(self.meta.tables, column.es_column)
                    if g[column.es_column] != None
                ])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.tables),
                        "cardinality": len(partitions),
                        "multi": 1,
                        "last_updated": Date.now()
                    },
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
                return

            es_index = column.es_index.split(".")[0]

            is_text = [
                cc for cc in self.meta.columns
                if cc.es_column == column.es_column and cc.es_type == "text"
            ]
            if is_text:
                # text IS A MULTIVALUE STRING THAT CAN ONLY BE FILTERED
                result = self.es_cluster.post("/" + es_index + "/_search",
                                              data={
                                                  "aggs": {
                                                      "count": {
                                                          "filter": {
                                                              "match_all": {}
                                                          }
                                                      }
                                                  },
                                                  "size": 0
                                              })
                count = result.hits.total
                cardinality = 1001
                multi = 1001
            elif column.es_column == "_id":
                result = self.es_cluster.post("/" + es_index + "/_search",
                                              data={
                                                  "query": {
                                                      "match_all": {}
                                                  },
                                                  "size": 0
                                              })
                count = cardinality = result.hits.total
                multi = 1
            elif column.es_type == BOOLEAN:
                result = self.es_cluster.post("/" + es_index + "/_search",
                                              data={
                                                  "aggs": {
                                                      "count":
                                                      _counting_query(column)
                                                  },
                                                  "size": 0
                                              })
                count = result.hits.total
                cardinality = 2
                multi = 1
            else:
                result = self.es_cluster.post(
                    "/" + es_index + "/_search",
                    data={
                        "aggs": {
                            "count": _counting_query(column),
                            "multi": {
                                "max": {
                                    "script":
                                    "doc[" + quote(column.es_column) +
                                    "].values.size()"
                                }
                            }
                        },
                        "size": 0
                    })
                agg_results = result.aggregations
                count = result.hits.total
                cardinality = coalesce(agg_results.count.value,
                                       agg_results.count._nested.value,
                                       agg_results.count.doc_count)
                multi = int(coalesce(agg_results.multi.value, 1))
                if cardinality == None:
                    Log.error("logic error")

            query = Data(size=0)

            if column.es_column == "_id":
                self.meta.columns.update({
                    "set": {
                        "count": cardinality,
                        "cardinality": cardinality,
                        "multi": 1,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
                return
            elif cardinality > 1000 or (count >= 30 and cardinality == count
                                        ) or (count >= 1000
                                              and cardinality / count > 0.99):
                DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts",
                                   table=column.es_index,
                                   field=column.es_column,
                                   num=cardinality)
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
                return
            elif column.es_type in elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
                DEBUG and Log.note("{{field}} has {{num}} parts",
                                   field=column.es_index,
                                   num=cardinality)
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
                return
            elif len(column.nested_path) != 1:
                query.aggs["_"] = {
                    "nested": {
                        "path": column.nested_path[0]
                    },
                    "aggs": {
                        "_nested": {
                            "terms": {
                                "field": column.es_column
                            }
                        }
                    }
                }
            elif cardinality == 0:
                query.aggs["_"] = {"terms": {"field": column.es_column}}
            else:
                query.aggs["_"] = {
                    "terms": {
                        "field": column.es_column,
                        "size": cardinality
                    }
                }

            result = self.es_cluster.post("/" + es_index + "/_search",
                                          data=query)

            aggs = result.aggregations._
            if aggs._nested:
                parts = jx.sort(aggs._nested.buckets.key)
            else:
                parts = jx.sort(aggs.buckets.key)

            self.meta.columns.update({
                "set": {
                    "count": count,
                    "cardinality": cardinality,
                    "multi": multi,
                    "partitions": parts,
                    "last_updated": Date.now()
                },
                "where": {
                    "eq": {
                        "es_index": column.es_index,
                        "es_column": column.es_column
                    }
                }
            })
        except Exception as e:
            # CAN NOT IMPORT: THE TEST MODULE SETS UP LOGGING
            # from tests.test_jx import TEST_TABLE
            TEST_TABLE = "testdata"
            is_missing_index = any(
                w in e for w in
                ["IndexMissingException", "index_not_found_exception"])
            is_test_table = any(
                column.es_index.startswith(t)
                for t in [TEST_TABLE_PREFIX, TEST_TABLE])
            if is_missing_index and is_test_table:
                # WE EXPECT TEST TABLES TO DISAPPEAR
                self.meta.columns.update({
                    "clear": ".",
                    "where": {
                        "eq": {
                            "es_index": column.es_index
                        }
                    }
                })
                self.index_does_not_exist.add(column.es_index)
            else:
                self.meta.columns.update({
                    "set": {
                        "last_updated": Date.now()
                    },
                    "clear": [
                        "count",
                        "cardinality",
                        "multi",
                        "partitions",
                    ],
                    "where": {
                        "eq": {
                            "names.\\.": ".",
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
                Log.warning(
                    "Could not get {{col.es_index}}.{{col.es_column}} info",
                    col=column,
                    cause=e)

    def monitor(self, please_stop):
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            try:
                if not self.todo:
                    old_columns = [
                        c for c in self.meta.columns
                        if (c.last_updated == None or c.last_updated <
                            Date.now() - TOO_OLD) and c.jx_type not in STRUCT
                    ]
                    if old_columns:
                        DEBUG and Log.note(
                            "Old columns {{names|json}} last updated {{dates|json}}",
                            names=wrap(old_columns).es_column,
                            dates=[
                                Date(t).format()
                                for t in wrap(old_columns).last_updated
                            ])
                        self.todo.extend(old_columns)
                        # TEST CONSISTENCY
                        for c, d in product(list(self.todo.queue),
                                            list(self.todo.queue)):
                            if c.es_column == d.es_column and c.es_index == d.es_index and c != d:
                                Log.error("")
                    else:
                        DEBUG and Log.note("no more metatdata to update")

                column = self.todo.pop(Till(seconds=(10 * MINUTE).seconds))
                if column:
                    if column is THREAD_STOP:
                        continue

                    DEBUG and Log.note("update {{table}}.{{column}}",
                                       table=column.es_index,
                                       column=column.es_column)
                    if column.es_index in self.index_does_not_exist:
                        self.meta.columns.update({
                            "clear": ".",
                            "where": {
                                "eq": {
                                    "es_index": column.es_index
                                }
                            }
                        })
                        continue
                    if column.jx_type in STRUCT or column.es_column.endswith(
                            "." + EXISTS_TYPE):
                        column.last_updated = Date.now()
                        continue
                    elif column.last_updated >= Date.now() - TOO_OLD:
                        continue
                    try:
                        self._update_cardinality(column)
                        (DEBUG
                         and not column.es_index.startswith(TEST_TABLE_PREFIX)
                         ) and Log.note("updated {{column.name}}",
                                        column=column)
                    except Exception as e:
                        Log.warning(
                            "problem getting cardinality for {{column.name}}",
                            column=column,
                            cause=e)
            except Exception as e:
                Log.warning("problem in cardinality monitor", cause=e)

    def not_monitor(self, please_stop):
        Log.alert("metadata scan has been disabled")
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            c = self.todo.pop()
            if c == THREAD_STOP:
                break

            if c.last_updated >= Date.now() - TOO_OLD:
                continue

            self.meta.columns.update({
                "set": {
                    "last_updated": Date.now()
                },
                "clear": [
                    "count",
                    "cardinality",
                    "multi",
                    "partitions",
                ],
                "where": {
                    "eq": {
                        "es_index": c.es_index,
                        "es_column": c.es_column
                    }
                }
            })
            DEBUG and Log.note(
                "Did not get {{col.es_index}}.{{col.es_column}} info", col=c)

    def get_table(self, alias_name):
        with self.meta.tables.locker:
            return wrap(
                [t for t in self.meta.tables.data if t.name == alias_name])

    def get_snowflake(self, fact_table_name):
        return Snowflake(fact_table_name, self)

    def get_schema(self, name):
        if name == "meta.columns":
            return self.meta.columns.schema
        query_path = split_field(name)
        root, rest = query_path[0], join_field(query_path[1:])
        return self.get_snowflake(root).get_schema(rest)
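
The long elif chain in _update_cardinality decides whether a column's distinct values are worth storing as partitions. The thresholds reduce to one predicate, sketched here with the same numbers used in the code above:

def keep_partitions(count, cardinality, is_numeric):
    # TOO MANY DISTINCT VALUES: KEEP ONLY THE COUNTS
    if cardinality > 1000:
        return False
    # EFFECTIVELY UNIQUE (e.g. ids): PARTITIONS WOULD JUST ECHO THE DATA
    if count >= 30 and cardinality == count:
        return False
    if count >= 1000 and cardinality / count > 0.99:
        return False
    # NUMERIC COLUMNS WITH MANY VALUES ARE RANGES, NOT ENUMERABLE PARTS
    if is_numeric and cardinality > 30:
        return False
    return True

Only when this returns True does the method issue the follow-up terms aggregation that actually fetches the partitions.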
Example #6
class Table(BaseFacts):
    @override
    def __init__(
        self,
        table,
        typed,
        read_only,
        sharded,
        container,
        id=Null,
        partition=Null,
        cluster=Null,
        top_level_fields=Null,
        kwargs=None,
    ):
        self.short_name = table
        self.typed = typed
        self.read_only = read_only
        self.cluster = cluster
        self.id = id
        self.top_level_fields = top_level_fields
        self.config = Data(  # USED TO REPLICATE THIS
            typed=typed,
            read_only=read_only,
            sharded=sharded,
            id=id,
            partition=partition,
            cluster=cluster,
            top_level_fields=top_level_fields,
        )

        esc_name = escape_name(table)
        self.full_name = container.full_name + esc_name
        self.alias_view = alias_view = container.client.get_table(text(self.full_name))
        self.partition = partition
        self.container = container

        if not sharded:
            if not read_only and alias_view.table_type == "VIEW":
                Log.error("Expecting a table, not a view")
            self.shard = alias_view
            self._flake = Snowflake.parse(
                alias_view.schema,
                text(self.full_name),
                self.top_level_fields,
                partition,
            )
        else:
            if alias_view.table_type != "VIEW":
                Log.error("Sharded tables require a view")
            current_view = container.client.get_table(text(self.full_name))
            view_sql = current_view.view_query
            shard_name = _extract_primary_shard_name(view_sql)
            try:
                self.shard = container.client.get_table(
                    text(container.full_name + shard_name)
                )
                self._flake = Snowflake.parse(
                    alias_view.schema,
                    text(self.full_name),
                    self.top_level_fields,
                    partition,
                )
            except Exception as e:
                Log.warning("view {{name}} is invalid", name=shard_name, cause=e)
                self._flake = Snowflake.parse(
                    alias_view.schema,
                    text(self.full_name),
                    self.top_level_fields,
                    partition,
                )
                # REMOVE STALE VIEW
                container.client.delete_table(current_view)

                # MAKE NEW VIEW POINTING TO NEW SHARD
                self._create_new_shard()
                container.create_view(
                    self.full_name,
                    self.container.full_name + ApiName(self.shard.table_id),
                )

        self.last_extend = Date.now() - EXTEND_LIMIT
        self.extend_locker = Lock()
        self.extend_queue = Queue("wait for extend")

    def all_records(self):
        """
        MOSTLY FOR TESTING, RETURN ALL RECORDS IN TABLE
        :return:
        """
        return self.sql_query(sql_query({"from": text(self.full_name)}, self.schema))

    def jx_query(self, jx_query):
        docs = self.sql_query(
            sql_query(
                dict_to_data({"from": text(self.full_name)}) | jx_query, self.schema
            )
        )
        data = []
        for d in docs:
            u = untyped(from_data(leaves_to_data(d)))
            data.append(u)

        return Data(data=data, format="list")

    @property
    def schema(self):
        return self._flake

    def sql_query(self, sql):
        """
        :param sql: SQL QUERY
        :return: GENERATOR OF DOCUMENTS as dict
        """
        query_job = self.container.query_and_wait(sql)
        # WE WILL REACH INTO THE _flake, SINCE THIS IS THE FIRST PLACE WE ARE ACTUALLY PULLING RECORDS OUT
        # TODO: WITH MORE CODE THIS LOGIC GOES ELSEWHERE
        _ = self._flake.columns  # ENSURE schema HAS BEEN PROCESSED
        if not self._flake._top_level_fields.keys():
            for row in query_job:
                yield untyped(dict(row))
        else:
            top2deep = {
                name: path for path, name in self._flake._top_level_fields.items()
            }
            for row in query_job:
                output = {}
                doc = dict(row)
                # COPY ALL BUT TOP LEVEL FIELDS
                for k, v in doc.items():
                    deep = top2deep.get(k)
                    if deep is None:
                        output[k] = v
                # INSERT TOP LEVEL FIELDS
                reach = wrap(output)
                for k, p in top2deep.items():
                    reach[p] = doc.get(k)
                yield untyped(output)

    @property
    def flake(self):
        return self._flake

    def _create_new_shard(self):
        primary_shard = self.container.create_table(
            table=self.short_name + "_" + "".join(randoms.sample(ALLOWED, 20)),
            sharded=False,
            schema=self._flake.schema,
            kwargs=self.config,
        )
        self.shard = primary_shard.shard

    def extend(self, docs):
        self.extend_queue.extend(docs)
        with self.extend_locker:
            docs = self.extend_queue.pop_all()
            self._extend(docs)

    def _extend(self, rows):
        if self.read_only:
            Log.error("not for writing")
        if len(rows) == 0:
            return

        try:
            update = {}
            with Timer("encoding", verbose=DEBUG):
                while True:
                    typed_rows = []
                    for rownum, row in enumerate(rows):
                        typed_row, more, add_nested = typed_encode(row, self.flake)
                        set_default(update, more)
                        if add_nested:
                            # row HAS NEW NESTED COLUMN!
                            # GO OVER THE rows AGAIN SO "RECORD" GET MAPPED TO "REPEATED"
                            DEBUG and Log.note("New nested document found, retrying")
                            break
                        typed_rows.append(typed_row)
                    else:
                        break

            if update or not self.shard:
                # BATCH HAS ADDITIONAL COLUMNS!!
                # WE CAN NOT USE THE EXISTING SHARD, MAKE A NEW ONE:
                self._create_new_shard()
                DEBUG and Log.note(
                    "added new shard with name: {{shard}}", shard=self.shard.table_id
                )
            with Timer(
                "insert {{num}} rows to bq", param={"num": len(rows)}, verbose=DEBUG
            ):
                failures = self.container.client.insert_rows_json(
                    self.shard,
                    json_rows=typed_rows,
                    row_ids=[None] * len(typed_rows),
                    skip_invalid_rows=False,
                    ignore_unknown_values=False,
                )
            if failures:
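                # BIGQUERY REPORTS "stopped" FOR ROWS SKIPPED BECAUSE OTHER ROWS FAILED;
                # WHEN EVERY ROW IS "stopped", ABANDON THIS SHARD AND START A NEW ONE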
                if all(r == "stopped" for r in wrap(failures).errors.reason):
                    self._create_new_shard()
                    DEBUG and Log.note(
                        "STOPPED encountered: Added new shard with name: {{shard}}",
                        shard=self.shard.table_id,
                    )
                Log.error(
                    "Got {{num}} failures:\n{{failures|json}}",
                    num=len(failures),
                    failures=failures[:5],
                )
            else:
                self.last_extend = Date.now()
                DEBUG and Log.note("{{num}} rows added", num=len(typed_rows))
        except Exception as cause:
            cause = Except.wrap(cause)
            if (
                len(typed_rows) < 2
                and "Your client has issued a malformed or illegal request." in cause
            ):
                Log.error(
                    "big query complains about:\n{{data|json}}",
                    data=typed_rows,
                    cause=cause,
                )
            elif len(rows) > 1 and (
                "Request payload size exceeds the limit" in cause
                or "An existing connection was forcibly closed by the remote host"
                in cause
                or "Your client has issued a malformed or illegal request." in cause
                or "BrokenPipeError(32, 'Broken pipe')" in cause
                or "ConnectionResetError(104, 'Connection reset by peer')" in cause
            ):
                Log.warning(
                    "problem with batch of size {{size}}", size=len(rows), cause=cause
                )
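                # PAYLOAD TOO BIG, OR CONNECTION DROPPED: RETRY IN TEN SMALLER BATCHES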
                batch_size = ceiling(len(rows) / 10)
                try:
                    DEBUG and Log.note(
                        "attempt smaller batches of size {{batch_size}}",
                        batch_size=batch_size,
                    )
                    for _, chunk in jx.chunk(rows, batch_size):
                        self._extend(chunk)
                    return
                except Exception as cause2:
                    Log.error(
                        "smaller batches of size {{batch_size}} did not work",
                        batch_size=batch_size,
                        cause=cause2,
                    )
            elif len(rows) == 1:
                Log.error(
                    "Could not insert document\n{{doc|json|indent}}",
                    doc=rows[0],
                    cause=cause,
                )
            else:
                Log.error("Do not know how to handle", cause=cause)

    def add(self, row):
        self.extend([row])

    def merge_shards(self):
        shards = []
        tables = list(self.container.client.list_tables(self.container.dataset))
        current_view = Null  # VIEW THAT POINTS TO PRIMARY SHARD
        primary_shard_name = None  # PRIMARY SHARD
        api_name = escape_name(self.short_name)

        for table_item in tables:
            table = table_item.reference
            table_api_name = ApiName(table.table_id)
            if text(table_api_name).startswith(text(api_name)):
                if table_api_name == api_name:
                    if table_item.table_type != "VIEW":
                        Log.error("expecting {{table}} to be a view", table=api_name)
                    current_view = self.container.client.get_table(table)
                    view_sql = current_view.view_query
                    primary_shard_name = _extract_primary_shard_name(view_sql)
                elif SUFFIX_PATTERN.match(text(table_api_name)[len(text(api_name)) :]):
                    try:
                        known_table = self.container.client.get_table(table)
                        shards.append(known_table)
                    except Exception as e:
                        Log.warning(
                            "could not merge table {{table}}", table=table, cause=e
                        )

        if not current_view:
            Log.error(
                "expecting {{table}} to be a view pointing to a table", table=api_name
            )

        shard_flakes = [
            Snowflake.parse(
                big_query_schema=shard.schema,
                es_index=text(self.container.full_name + ApiName(shard.table_id)),
                top_level_fields=self.top_level_fields,
                partition=self.partition,
            )
            for shard in shards
        ]
        total_flake = snowflakes.merge(
            shard_flakes,
            es_index=text(self.full_name),
            top_level_fields=self.top_level_fields,
            partition=self.partition,
        )

        for i, s in enumerate(shards):
            if ApiName(s.table_id) == primary_shard_name:
                if total_flake == shard_flakes[i]:
                    # USE THE CURRENT PRIMARY SHARD AS A DESTINATION
                    del shards[i]
                    del shard_flakes[i]
                    break
        else:
            name = self.short_name + "_" + "".join(randoms.sample(ALLOWED, 20))
            primary_shard_name = escape_name(name)
            self.container.create_table(
                table=name,
                schema=total_flake.schema,
                sharded=False,
                read_only=False,
                kwargs=self.config,
            )

        primary_full_name = self.container.full_name + primary_shard_name

        selects = []
        for flake, table in zip(shard_flakes, shards):
            q = ConcatSQL(
                SQL_SELECT,
                JoinSQL(ConcatSQL(SQL_COMMA, SQL_CR), gen_select(total_flake, flake)),
                SQL_FROM,
                quote_column(ApiName(table.dataset_id, table.table_id)),
            )
            selects.append(q)

        DEBUG and Log.note(
            "inserting into table {{table}}", table=text(primary_shard_name)
        )
        matched = []
        unmatched = []
        for sel, shard, flake in zip(selects, shards, shard_flakes):
            if flake == total_flake:
                matched.append((sel, shard, flake))
            else:
                unmatched.append((sel, shard, flake))

        # EVERYTHING THAT IS IDENTICAL TO PRIMARY CAN BE MERGED WITH SIMPLE UNION ALL
        if matched:
            for g, merge_chunk in jx.chunk(matched, MAX_MERGE):
                command = ConcatSQL(
                    SQL_INSERT,
                    quote_column(primary_full_name),
                    JoinSQL(
                        SQL_UNION_ALL,
                        (
                            sql_query(
                                {
                                    "from": text(
                                        self.container.full_name
                                        + ApiName(shard.table_id)
                                    )
                                },
                                schema,
                            )
                            for _, shard, schema in merge_chunk
                        ),
                    ),
                )
                DEBUG and Log.note("{{sql}}", sql=text(command))
                job = self.container.query_and_wait(command)
                DEBUG and Log.note(
                    "job {{id}} state = {{state}}", id=job.job_id, state=job.state
                )

                if job.errors:
                    Log.error(
                        "\n{{sql}}\nDid not fill table:\n{{reason|json|indent}}",
                        sql=command.sql,
                        reason=job.errors,
                    )
                for _, shard, _ in merge_chunk:
                    self.container.client.delete_table(shard)

        # ALL OTHER SCHEMAS MISMATCH
        for s, shard, _ in unmatched:
            try:
                command = ConcatSQL(SQL_INSERT, quote_column(primary_full_name), s)
                DEBUG and Log.note("{{sql}}", sql=text(command))
                job = self.container.query_and_wait(command)
                DEBUG and Log.note(
                    "from {{shard}}, job {{id}}, state {{state}}",
                    id=job.job_id,
                    shard=shard.table_id,
                    state=job.state,
                )

                if job.errors:
                    if all(
                        " does not have a schema." in m
                        for m in wrap(job.errors).message
                    ):
                        pass  # NOTHING TO DO
                    else:
                        Log.error(
                            "\n{{sql}}\nDid not fill table:\n{{reason|json|indent}}",
                            sql=command.sql,
                            reason=job.errors,
                        )

                self.container.client.delete_table(shard)
            except Exception as e:
                Log.warning("failure to merge {{shard}}", shard=shard, cause=e)

        # REMOVE OLD VIEW
        view_full_name = self.container.full_name + api_name
        if current_view:
            self.container.client.delete_table(current_view)

        # CREATE NEW VIEW
        self.container.create_view(view_full_name, primary_full_name)

    def condense(self):
        """
        KEEP ONLY THE MOST RECENT VERSION OF EACH RECORD, PARTITIONED BY id.field
        AND ORDERED BY id.version
        :return:
        """
        # MAKE NEW SHARD
        partition = JoinSQL(
            SQL_COMMA,
            [
                quote_column(c.es_field)
                for f in listwrap(self.id.field)
                for c in self.flake.leaves(f)
            ],
        )
        order_by = JoinSQL(
            SQL_COMMA,
            [
                ConcatSQL(quote_column(c.es_field), SQL_DESC)
                for f in listwrap(self.id.version)
                for c in self.flake.leaves(f)
            ],
        )
        # WRAP WITH etl.timestamp BEST SELECTION
        self.container.query_and_wait(
            ConcatSQL(
                SQL(  # SOME KEYWORDS: ROWNUM RANK
                    "SELECT * EXCEPT (_rank) FROM (SELECT *, ROW_NUMBER() OVER (PARTITION BY "
                ),
                partition,
                SQL_ORDERBY,
                order_by,
                SQL(") AS _rank FROM "),
                quote_column(self.full_name),
                SQL(") a WHERE _rank=1"),
            )
        )
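
For reference, the deduplication query condense() assembles has this shape (a sketch with placeholder column and table names, not the exact SQL built above):

DEDUP_SQL = """
SELECT * EXCEPT (_rank)
FROM (
    SELECT *, ROW_NUMBER() OVER (PARTITION BY id_field ORDER BY version_field DESC) AS _rank
    FROM my_dataset.my_table
) a
WHERE _rank = 1
"""

Only the highest-version copy of each record survives; every other copy is filtered out by _rank.
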
Exemplo n.º 8
0
class ElasticsearchMetadata(Namespace):
    """
    MANAGE SNOWFLAKE SCHEMAS FOR EACH OF THE ALIASES FOUND IN THE CLUSTER
    """

    @override
    def __new__(cls, kwargs, *args, **_kwargs):
        es_cluster = elasticsearch.Cluster(kwargs)
        output = known_clusters.get(id(es_cluster))
        if output is None:
            output = object.__new__(cls)
            known_clusters[id(es_cluster)] = output
        return output

    @override
    def __init__(self, host, index, sql_file='metadata.sqlite', alias=None, name=None, port=9200, kwargs=None):
        if hasattr(self, "settings"):
            return

        self.too_old = TOO_OLD
        self.settings = kwargs
        self.default_name = coalesce(name, alias, index)
        self.es_cluster = elasticsearch.Cluster(kwargs=kwargs)

        self.index_does_not_exist = set()
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        self.index_to_alias = Relation_usingList()

        self.es_metadata = Null
        self.metadata_last_updated = Date.now() - OLD_METADATA

        self.meta = Data()
        self.meta.columns = ColumnList()

        self.alias_to_query_paths = {
            "meta.columns": [['.']],
            "meta.tables": [['.']]
        }
        self.alias_last_updated = {
            "meta.columns": Date.now(),
            "meta.tables": Date.now()
        }
        table_columns = metadata_tables()
        self.meta.tables = ListContainer(
            "meta.tables",
            [
                # TableDesc("meta.columns", None, ".", Date.now()),
                # TableDesc("meta.tables", None, ".", Date.now())
            ],
            jx_base.Schema(".", table_columns)
        )
        self.meta.columns.extend(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("refresh metadata", self.not_monitor)
        return

    @property
    def namespace(self):
        return self.meta.columns.namespace

    @property
    def url(self):
        return self.es_cluster.url / self.default_name.replace(".", "/")

    def _reload_columns(self, table_desc):
        """
        :param table_desc: DESCRIPTION OF A REAL ALIAS (OR NAME OF INDEX THAT HAS NO ALIAS)
        :return:
        """
        # FIND ALL INDEXES OF ALIAS
        es_last_updated = self.es_cluster.metatdata_last_updated

        alias = table_desc.name
        canonical_index = self.es_cluster.get_best_matching_index(alias).index
        update_required = not (table_desc.timestamp < es_last_updated)
        metadata = self.es_cluster.get_metadata(force=update_required)

        indexes = self.index_to_alias.get_domain(alias)
        props = [
            (self.es_cluster.get_index(index=i, type=t, debug=DEBUG), t, m.properties)
            for i, d in metadata.indices.items()
            if i in indexes
            for t, m in [_get_best_type_from_mapping(d.mappings)]
        ]

        # CONFIRM ALL COLUMNS ARE SAME, FIX IF NOT
        dirty = False
        all_comparisions = list(jx.pairwise(props)) + list(jx.pairwise(jx.reverse(props)))
        # NOTICE THE SAME (index, type, properties) TRIPLE FROM ABOVE
        for (i1, t1, p1), (i2, t2, p2) in all_comparisions:
            diff = elasticsearch.diff_schema(p2, p1)
            if not self.settings.read_only:
                for d in diff:
                    dirty = True
                    i1.add_property(*d)
        meta = self.es_cluster.get_metadata(force=dirty).indices[canonical_index]

        data_type, mapping = _get_best_type_from_mapping(meta.mappings)
        mapping.properties["_id"] = {"type": "string", "index": "not_analyzed"}
        self._parse_properties(alias, mapping, meta)
        table_desc.timestamp = es_last_updated

    def _parse_properties(self, alias, mapping, meta):
        abs_columns = elasticsearch.parse_properties(alias, None, mapping.properties)
        if any(c.cardinality == 0 and c.names['.'] != '_id' for c in abs_columns):
            Log.warning(
                "Some columns are not stored {{names}}",
                names=[
                    ".".join((c.es_index, c.names['.']))
                    for c in abs_columns
                    if c.cardinality == 0
                ]
            )

        with Timer("upserting {{num}} columns", {"num": len(abs_columns)}, silent=not DEBUG):
            # LIST OF EVERY NESTED PATH
            query_paths = [[c.es_column] for c in abs_columns if c.es_type == "nested"]
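            # INSERT EACH ANCESTOR SPLIT POINT INTO THE PATHS OF ITS DESCENDANTS,
            # SO EVERY QUERY PATH LISTS ITS SPLITS FROM DEEPEST TO SHALLOWEST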
            for a, b in itertools.product(query_paths, query_paths):
                aa = a[0]
                bb = b[0]
                if aa and bb.startswith(aa):
                    for i, b_prefix in enumerate(b):
                        if len(b_prefix) > len(aa):
                            continue
                        if aa == b_prefix:
                            break  # SPLIT ALREADY FOUND
                        b.insert(i, aa)
                        break
            for q in query_paths:
                q.append(SELF_PATH)
            query_paths.append(ROOT_PATH)
            self.alias_to_query_paths[alias] = query_paths
            for i in self.index_to_alias.get_domain(alias):
                self.alias_to_query_paths[i] = query_paths

            # ADD RELATIVE NAMES
            for abs_column in abs_columns:
                abs_column.last_updated = None
                abs_column.jx_type = jx_type(abs_column)
                for query_path in query_paths:
                    abs_column.names[query_path[0]] = relative_field(abs_column.names["."], query_path[0])
                self.todo.add(self.meta.columns.add(abs_column))

    def query(self, _query):
        return self.meta.columns.query(QueryOp(set_default(
            {
                "from": self.meta.columns,
                "sort": ["table", "name"]
            },
            _query.__data__()
        )))

    def _find_alias(self, name):
        if self.metadata_last_updated < self.es_cluster.metatdata_last_updated:
            for a in self.es_cluster.get_aliases():
                self.index_to_alias[a.index] = coalesce(a.alias, a.index)
                self.alias_last_updated.setdefault(a.alias, Date.MIN)
        if name in self.alias_last_updated:
            return name
        else:
            return self.index_to_alias[name]

    def get_columns(self, table_name, column_name=None, force=False):
        """
        RETURN METADATA COLUMNS
        """
        table_path = split_field(table_name)
        root_table_name = table_path[0]

        alias = self._find_alias(root_table_name)
        if not alias:
            self.es_cluster.get_metadata(force=True)
            alias = self._find_alias(root_table_name)
            if not alias:
                Log.error("{{table|quote}} does not exist", table=table_name)

        try:
            last_update = MAX([
                self.es_cluster.index_last_updated[i]
                for i in self.index_to_alias.get_domain(alias)
            ])

            table = self.get_table(alias)[0]
            # LAST TIME WE GOT INFO FOR THIS TABLE
            if not table:
                table = TableDesc(
                    name=alias,
                    url=None,
                    query_path=['.'],
                    timestamp=Date.MIN
                )
                with self.meta.tables.locker:
                    self.meta.tables.add(table)
                self._reload_columns(table)
            elif force or table.timestamp < last_update:
                self._reload_columns(table)

            columns = self.meta.columns.find(alias, column_name)
            columns = jx.sort(columns, "names.\\.")
            # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
            while len(self.todo) and not all(columns.get("last_updated")):
                if DEBUG:
                    if len(columns) > 10:
                        Log.note("waiting for {{num}} columns to update", num=len([c for c in columns if not c.last_updated]))
                    else:
                        Log.note("waiting for columns to update {{columns|json}}", columns=[c.es_index+"."+c.es_column for c in columns if not c.last_updated])
                Till(seconds=1).wait()
            return columns
        except Exception as e:
            Log.error("Not expected", cause=e)

        return []

    def _update_cardinality(self, column):
        """
        QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN
        """
        if column.es_index in self.index_does_not_exist:
            return

        if column.jx_type in STRUCT:
            Log.error("not supported")
        try:
            if column.es_index == "meta.columns":
                partitions = jx.sort([g[column.es_column] for g, _ in jx.groupby(self.meta.columns, column.es_column) if g[column.es_column] != None])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.columns),
                        "cardinality": len(partitions),
                        "multi": 1,
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            if column.es_index == "meta.tables":
                partitions = jx.sort([g[column.es_column] for g, _ in jx.groupby(self.meta.tables, column.es_column) if g[column.es_column] != None])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.tables),
                        "cardinality": len(partitions),
                        "multi": 1,
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return

            es_index = column.es_index.split(".")[0]

            is_text = [cc for cc in self.meta.columns if cc.es_column == column.es_column and cc.es_type == "text"]
            if is_text:
                # text IS A MULTIVALUE STRING THAT CAN ONLY BE FILTERED
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "aggs": {
                        "count": {"filter": {"match_all": {}}}
                    },
                    "size": 0
                })
                count = result.hits.total
                cardinality = max(1001, count)
                multi = 1001
            elif column.es_column == "_id":
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "query": {"match_all": {}},
                    "size": 0
                })
                count = cardinality = result.hits.total
                multi = 1
            elif column.es_type == BOOLEAN:
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "aggs": {
                        "count": _counting_query(column)
                    },
                    "size": 0
                })
                count = result.hits.total
                cardinality = 2
                multi = 1
            else:
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "aggs": {
                        "count": _counting_query(column),
                        "multi": {"max": {"script": "doc[" + quote(column.es_column) + "].values.size()"}}
                    },
                    "size": 0
                })
                agg_results = result.aggregations
                count = result.hits.total
                cardinality = coalesce(agg_results.count.value, agg_results.count._nested.value, agg_results.count.doc_count)
                multi = int(coalesce(agg_results.multi.value, 1))
                if cardinality == None:
                    Log.error("logic error")

            query = Data(size=0)

            if column.es_column == "_id":
                self.meta.columns.update({
                    "set": {
                        "count": cardinality,
                        "cardinality": cardinality,
                        "multi": 1,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
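            # TOO MANY DISTINCT VALUES, OR EFFECTIVELY UNIQUE: RECORD COUNTS BUT DO NOT STORE PARTITIONS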
            elif cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99):
                DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            elif column.es_type in elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
                DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            elif len(column.nested_path) != 1:
                query.aggs["_"] = {
                    "nested": {"path": column.nested_path[0]},
                    "aggs": {"_nested": {"terms": {"field": column.es_column}}}
                }
            elif cardinality == 0:
                query.aggs["_"] = {"terms": {"field": column.es_column}}
            else:
                query.aggs["_"] = {"terms": {"field": column.es_column, "size": cardinality}}

            result = self.es_cluster.post("/" + es_index + "/_search", data=query)

            aggs = result.aggregations._
            if aggs._nested:
                parts = jx.sort(aggs._nested.buckets.key)
            else:
                parts = jx.sort(aggs.buckets.key)

            self.meta.columns.update({
                "set": {
                    "count": count,
                    "cardinality": cardinality,
                    "multi": multi,
                    "partitions": parts,
                    "last_updated": Date.now()
                },
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
        except Exception as e:
            # CAN NOT IMPORT: THE TEST MODULES SETS UP LOGGING
            # from tests.test_jx import TEST_TABLE
            e = Except.wrap(e)
            TEST_TABLE = "testdata"
            is_missing_index = any(w in e for w in ["IndexMissingException", "index_not_found_exception"])
            is_test_table = column.es_index.startswith((TEST_TABLE_PREFIX, TEST_TABLE))
            if is_missing_index and is_test_table:
                # WE EXPECT TEST TABLES TO DISAPPEAR
                self.meta.columns.update({
                    "clear": ".",
                    "where": {"eq": {"es_index": column.es_index}}
                })
                self.index_does_not_exist.add(column.es_index)
            else:
                self.meta.columns.update({
                    "set": {
                        "last_updated": Date.now()
                    },
                    "clear": [
                        "count",
                        "cardinality",
                        "multi",
                        "partitions",
                    ],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                Log.warning("Could not get {{col.es_index}}.{{col.es_column}} info", col=column, cause=e)

    def monitor(self, please_stop):
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            try:
                if not self.todo:
                    old_columns = [
                        c
                        for c in self.meta.columns
                        if (c.last_updated == None or c.last_updated < Date.now()-TOO_OLD) and c.jx_type not in STRUCT
                    ]
                    if old_columns:
                        DEBUG and Log.note(
                            "Old columns {{names|json}} last updated {{dates|json}}",
                            names=wrap(old_columns).es_column,
                            dates=[Date(t).format() for t in wrap(old_columns).last_updated]
                        )
                        self.todo.extend(old_columns)
                        # TEST CONSISTENCY
                        for c, d in product(list(self.todo.queue), list(self.todo.queue)):
                            if c.es_column == d.es_column and c.es_index == d.es_index and c != d:
                                Log.error("")
                    else:
                        DEBUG and Log.note("no more metatdata to update")

                column = self.todo.pop(Till(seconds=(10*MINUTE).seconds))
                if column:
                    if column is THREAD_STOP:
                        continue

                    with Timer("update {{table}}.{{column}}", param={"table": column.es_index, "column": column.es_column}, silent=not DEBUG):
                        if column.es_index in self.index_does_not_exist:
                            self.meta.columns.update({
                                "clear": ".",
                                "where": {"eq": {"es_index": column.es_index}}
                            })
                            continue
                        if column.jx_type in STRUCT or column.es_column.endswith("." + EXISTS_TYPE):
                            column.last_updated = Date.now()
                            continue
                        elif column.last_updated >= Date.now()-TOO_OLD:
                            continue
                        try:
                            self._update_cardinality(column)
                            (DEBUG and not column.es_index.startswith(TEST_TABLE_PREFIX)) and Log.note("updated {{column.name}}", column=column)
                        except Exception as e:
                            if '"status":404' in e:
                                self.meta.columns.update({
                                    "clear": ".",
                                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                                })
                            else:
                                Log.warning("problem getting cardinality for {{column.name}}", column=column, cause=e)
            except Exception as e:
                Log.warning("problem in cardinality monitor", cause=e)

    def not_monitor(self, please_stop):
        Log.alert("metadata scan has been disabled")
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
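        # NO SCANNING: JUST MARK EACH COLUMN AS CURRENT AND CLEAR ANY STALE STATS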
        while not please_stop:
            c = self.todo.pop()
            if c == THREAD_STOP:
                break

            if c.last_updated >= Date.now()-TOO_OLD:
                continue

            with Timer("Update {{col.es_index}}.{{col.es_column}}", param={"col": c}, silent=not DEBUG, too_long=0.05):
                self.meta.columns.update({
                    "set": {
                        "last_updated": Date.now()
                    },
                    "clear": [
                        "count",
                        "cardinality",
                        "multi",
                        "partitions",
                    ],
                    "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                })

    def get_table(self, name):
        if name == "meta.columns":
            return self.meta.columns

        with self.meta.tables.locker:
            return wrap([t for t in self.meta.tables.data if t.name == name])

    def get_snowflake(self, fact_table_name):
        return Snowflake(fact_table_name, self)

    def get_schema(self, name):
        if name == "meta.columns":
            return self.meta.columns.schema
        query_path = split_field(name)
        root, rest = query_path[0], join_field(query_path[1:])
        return self.get_snowflake(root).get_schema(rest)
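
The __new__ override in this class returns one cached instance per Elasticsearch cluster. A stripped-down sketch of the same pattern, with hypothetical names:

known = {}

class PerClusterSingleton(object):
    def __new__(cls, cluster_key):
        output = known.get(cluster_key)
        if output is None:
            output = object.__new__(cls)
            known[cluster_key] = output
        return output

    def __init__(self, cluster_key):
        # __init__ RUNS ON EVERY CONSTRUCTION, EVEN FOR THE CACHED INSTANCE
        if hasattr(self, "key"):
            return
        self.key = cluster_key

The hasattr(self, "settings") guard at the top of __init__ above serves the same purpose: it skips re-initialization when the cached instance is returned.
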
Exemplo n.º 9
0
class ElasticsearchMetadata(Namespace):
    """
    MANAGE SNOWFLAKE SCHEMAS FOR EACH OF THE ALIASES FOUND IN THE CLUSTER
    """
    @override
    def __new__(cls, kwargs, *args, **_kwargs):
        es_cluster = elasticsearch.Cluster(kwargs)
        output = known_clusters.get(id(es_cluster))
        if output is None:
            output = object.__new__(cls)
            known_clusters[id(es_cluster)] = output
        return output

    @override
    def __init__(self,
                 host,
                 index,
                 sql_file='metadata.sqlite',
                 alias=None,
                 name=None,
                 port=9200,
                 kwargs=None):
        if hasattr(self, "settings"):
            return

        self.too_old = TOO_OLD
        self.settings = kwargs
        self.default_name = coalesce(name, alias, index)
        self.es_cluster = elasticsearch.Cluster(kwargs=kwargs)

        self.index_does_not_exist = set()
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        self.index_to_alias = {}

        self.es_metadata = Null
        self.metadata_last_updated = Date.now() - OLD_METADATA

        self.meta = Data()
        self.meta.columns = ColumnList(URL(self.es_cluster.settings.host).host)

        self.alias_to_query_paths = {
            "meta.columns": [ROOT_PATH],
            "meta.tables": [ROOT_PATH]
        }
        self.alias_last_updated = {
            "meta.columns": Date.now(),
            "meta.tables": Date.now()
        }
        table_columns = metadata_tables()
        self.meta.tables = ListContainer("meta.tables", [],
                                         jx_base.Schema(".", table_columns))
        self.meta.columns.extend(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("not refresh metadata", self.not_monitor)
        return

    @property
    def namespace(self):
        return self.meta.columns.namespace

    @property
    def url(self):
        return self.es_cluster.url / self.default_name.replace(".", "/")

    def _reload_columns(self, table_desc):
        """
        :param table_desc: DESCRIPTION OF A REAL ALIAS (OR NAME OF INDEX THAT HAS NO ALIAS)
        :return:
        """
        # FIND ALL INDEXES OF ALIAS
        es_last_updated = self.es_cluster.metatdata_last_updated

        alias = table_desc.name
        canonical_index = self.es_cluster.get_best_matching_index(alias).index
        es_metadata_update_required = not (table_desc.timestamp <
                                           es_last_updated)
        metadata = self.es_cluster.get_metadata(
            force=es_metadata_update_required)

        props = [(self.es_cluster.get_index(index=i, type=t,
                                            debug=DEBUG), t, m.properties)
                 for i, d in metadata.indices.items() if alias in d.aliases
                 for t, m in [_get_best_type_from_mapping(d.mappings)]]

        # CONFIRM ALL COLUMNS ARE SAME, FIX IF NOT
        dirty = False
        all_comparisions = list(jx.pairwise(props)) + list(
            jx.pairwise(jx.reverse(props)))
        # NOTICE THE SAME (index, type, properties) TRIPLE FROM ABOVE
        for (i1, t1, p1), (i2, t2, p2) in all_comparisions:
            diff = elasticsearch.diff_schema(p2, p1)
            if not self.settings.read_only:
                for d in diff:
                    dirty = True
                    i1.add_property(*d)
        meta = self.es_cluster.get_metadata(
            force=dirty).indices[canonical_index]

        data_type, mapping = _get_best_type_from_mapping(meta.mappings)
        mapping.properties["_id"] = {"type": "string", "index": "not_analyzed"}
        columns = self._parse_properties(alias, mapping)
        table_desc.timestamp = es_last_updated
        return columns

    def _parse_properties(self, alias, mapping):
        abs_columns = elasticsearch.parse_properties(alias, ".", ROOT_PATH,
                                                     mapping.properties)
        if DEBUG and any(c.cardinality == 0 and c.name != '_id'
                         for c in abs_columns):
            Log.warning(
                "Some columns are not stored in {{url}} {{index|quote}} table:\n{{names}}",
                url=self.es_cluster.url,
                index=alias,
                names=[
                    ".".join((c.es_index, c.name)) for c in abs_columns
                    if c.cardinality == 0
                ])

        with Timer("upserting {{num}} columns", {"num": len(abs_columns)},
                   silent=not DEBUG):
            # LIST OF EVERY NESTED PATH
            query_paths = [[c.es_column] for c in abs_columns
                           if c.es_type == "nested"]
            for a, b in itertools.product(query_paths, query_paths):
                aa = a[0]
                bb = b[0]
                if aa and bb.startswith(aa):
                    for i, b_prefix in enumerate(b):
                        if len(b_prefix) > len(aa):
                            continue
                        if aa == b_prefix:
                            break  # SPLIT ALREADY FOUND
                        b.insert(i, aa)
                        break
            for q in query_paths:
                q.append(".")
            query_paths.append(ROOT_PATH)

            # ENSURE ALL TABLES HAVE THE QUERY PATHS SET
            self.alias_to_query_paths[alias] = query_paths
            for i, a in self.index_to_alias.items():
                if a == alias:
                    self.alias_to_query_paths[i] = query_paths

            # ENSURE COLUMN HAS CORRECT jx_type
            # PICK DEEPEST NESTED PROPERTY AS REPRESENTATIVE
            output = []
            best = {}
            for abs_column in abs_columns:
                abs_column.jx_type = jx_type(abs_column)
                if abs_column.jx_type not in STRUCT:
                    clean_name = unnest_path(abs_column.name)
                    other = best.get(clean_name)
                    if other:
                        if len(other.nested_path) < len(
                                abs_column.nested_path):
                            output.remove(other)
                            self.meta.columns.update({
                                "clear": ".",
                                "where": {
                                    "eq": {
                                        "es_column": other.es_column,
                                        "es_index": other.es_index
                                    }
                                }
                            })
                        else:
                            continue
                    best[clean_name] = abs_column
                output.append(abs_column)

            # REGISTER ALL COLUMNS
            canonicals = []
            for abs_column in output:
                canonical = self.meta.columns.add(abs_column)
                canonicals.append(canonical)

            self.todo.extend(canonicals)
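            # THE BACKGROUND monitor THREAD WILL FILL IN count/cardinality/partitions LATER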
            return canonicals

    def query(self, _query):
        return self.meta.columns.query(
            QueryOp(
                set_default(
                    {
                        "from": self.meta.columns,
                        "sort": ["table", "name"]
                    }, _query.__data__())))

    def _find_alias(self, name):
        if self.metadata_last_updated < self.es_cluster.metatdata_last_updated:
            for a in self.es_cluster.get_aliases():
                self.index_to_alias[a.index] = coalesce(a.alias, a.index)
                self.alias_last_updated.setdefault(a.alias, Date.MIN)
        if name in self.alias_last_updated:
            return name
        else:
            return self.index_to_alias.get(name)

    def get_columns(self,
                    table_name,
                    column_name=None,
                    after=None,
                    timeout=None):
        """
        RETURN METADATA COLUMNS

        :param table_name: TABLE WE WANT COLUMNS FOR
        :param column_name:  OPTIONAL NAME, IF INTERESTED IN ONLY ONE COLUMN
        :param after: FORCE LOAD, WAITING FOR last_updated TO BE AFTER THIS TIME
        :param timeout: Signal; True when should give up
        :return:
        """
        DEBUG and after and Log.note("getting columns for after {{time}}",
                                     time=after)
        table_path = split_field(table_name)
        root_table_name = table_path[0]

        alias = self._find_alias(root_table_name)
        if not alias:
            self.es_cluster.get_metadata(force=True)
            alias = self._find_alias(root_table_name)
            if not alias:
                Log.error("{{table|quote}} does not exist", table=table_name)

        try:
            table = self.get_table(alias)[0]
            # LAST TIME WE GOT INFO FOR THIS TABLE
            if not table:
                table = TableDesc(name=alias,
                                  url=None,
                                  query_path=["."],
                                  timestamp=Date.MIN)
                with self.meta.tables.locker:
                    self.meta.tables.add(table)
                columns = self._reload_columns(table)
                DEBUG and Log.note("columns from reload")
            elif after or table.timestamp < self.es_cluster.metatdata_last_updated:
                columns = self._reload_columns(table)
                DEBUG and Log.note("columns from reload")
            else:
                columns = self.meta.columns.find(alias, column_name)
                DEBUG and Log.note("columns from find()")

            DEBUG and Log.note("columns are {{ids}}",
                               ids=[id(c) for c in columns])

            columns = jx.sort(columns, "name")

            if after is None:
                return columns  # DO NOT WAIT FOR COMPLETE COLUMNS

            # WAIT FOR THE COLUMNS TO UPDATE
            while True:
                pending = [
                    c for c in columns if after >= c.last_updated or (
                        c.cardinality == None and c.jx_type not in STRUCT)
                ]
                if not pending:
                    break
                if timeout:
                    Log.error("trying to gets columns timed out")
                if DEBUG:
                    if len(pending) > 10:
                        Log.note(
                            "waiting for {{num}} columns to update by {{timestamp}}",
                            num=len(pending),
                            timestamp=after)
                    else:
                        Log.note(
                            "waiting for columns to update by {{timestamp}}; {{columns|json}}",
                            timestamp=after,
                            columns=[
                                c.es_index + "." + c.es_column + " id=" +
                                text_type(id(c)) for c in pending
                            ])
                Till(seconds=1).wait()
            return columns
        except Exception as e:
            Log.error("Failure to get columns for {{table}}",
                      table=table_name,
                      cause=e)

        return []

    def _update_cardinality(self, column):
        """
        QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN
        """
        now = Date.now()
        if column.es_index in self.index_does_not_exist:
            return

        if column.jx_type in STRUCT:
            Log.error("not supported")
        try:
            if column.es_index == "meta.columns":
                partitions = jx.sort([
                    g[column.es_column]
                    for g, _ in jx.groupby(self.meta.columns, column.es_column)
                    if g[column.es_column] != None
                ])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.columns),
                        "cardinality": len(partitions),
                        "multi": 1,
                        "last_updated": now
                    },
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
                return
            if column.es_index == "meta.tables":
                partitions = jx.sort([
                    g[column.es_column]
                    for g, _ in jx.groupby(self.meta.tables, column.es_column)
                    if g[column.es_column] != None
                ])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.tables),
                        "cardinality": len(partitions),
                        "multi": 1,
                        "last_updated": now
                    },
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
                return

            es_index = column.es_index.split(".")[0]

            is_text = [
                cc for cc in self.meta.columns
                if cc.es_column == column.es_column and cc.es_type == "text"
            ]
            if is_text:
                # text IS A MULTIVALUE STRING THAT CAN ONLY BE FILTERED
                result = self.es_cluster.post("/" + es_index + "/_search",
                                              data={
                                                  "aggs": {
                                                      "count": {
                                                          "filter": {
                                                              "match_all": {}
                                                          }
                                                      }
                                                  },
                                                  "size": 0
                                              })
                count = result.hits.total
                cardinality = max(1001, count)
                multi = 1001
            elif column.es_column == "_id":
                result = self.es_cluster.post("/" + es_index + "/_search",
                                              data={
                                                  "query": {
                                                      "match_all": {}
                                                  },
                                                  "size": 0
                                              })
                count = cardinality = result.hits.total
                multi = 1
            elif column.es_type == BOOLEAN:
                result = self.es_cluster.post("/" + es_index + "/_search",
                                              data={
                                                  "aggs": {
                                                      "count":
                                                      _counting_query(column)
                                                  },
                                                  "size": 0
                                              })
                count = result.hits.total
                cardinality = 2

                DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts",
                                   table=column.es_index,
                                   field=column.es_column,
                                   num=cardinality)
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "partitions": [False, True],
                        "multi": 1,
                        "last_updated": now
                    },
                    "clear": ["partitions"],
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
                return
            else:
                es_query = {
                    "aggs": {
                        "count": _counting_query(column),
                        "_filter": {
                            "aggs": {
                                "multi": {
                                    "max": {
                                        "script":
                                        "doc[" + quote(column.es_column) +
                                        "].values.size()"
                                    }
                                }
                            },
                            "filter": {
                                "bool": {
                                    "should": [{
                                        "range": {
                                            "etl.timestamp.~n~": {
                                                "gte": (Date.today() - WEEK)
                                            }
                                        }
                                    }, {
                                        "bool": {
                                            "must_not": {
                                                "exists": {
                                                    "field":
                                                    "etl.timestamp.~n~"
                                                }
                                            }
                                        }
                                    }]
                                }
                            }
                        }
                    },
                    "size": 0
                }

                result = self.es_cluster.post("/" + es_index + "/_search",
                                              data=es_query)
                agg_results = result.aggregations
                count = result.hits.total
                cardinality = coalesce(agg_results.count.value,
                                       agg_results.count._nested.value,
                                       agg_results.count.doc_count)
                multi = int(coalesce(agg_results._filter.multi.value, 1))
                if cardinality == None:
                    Log.error("logic error")

            query = Data(size=0)

            if column.es_column == "_id":
                self.meta.columns.update({
                    "set": {
                        "count": cardinality,
                        "cardinality": cardinality,
                        "multi": 1,
                        "last_updated": now
                    },
                    "clear": ["partitions"],
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
                return
            elif cardinality > 1000 or (count >= 30 and cardinality == count
                                        ) or (count >= 1000
                                              and cardinality / count > 0.99):
                DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts",
                                   table=column.es_index,
                                   field=column.es_column,
                                   num=cardinality)
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "last_updated": now
                    },
                    "clear": ["partitions"],
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
                return
            elif column.es_type in elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
                DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts",
                                   table=column.es_index,
                                   field=column.es_column,
                                   num=cardinality)
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "last_updated": now
                    },
                    "clear": ["partitions"],
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
                return
            elif len(column.nested_path) != 1:
                query.aggs["_"] = {
                    "nested": {
                        "path": column.nested_path[0]
                    },
                    "aggs": {
                        "_nested": {
                            "terms": {
                                "field": column.es_column
                            }
                        }
                    }
                }
            elif cardinality == 0:  # WHEN DOES THIS HAPPEN?
                query.aggs["_"] = {"terms": {"field": column.es_column}}
            else:
                query.aggs["_"] = {
                    "terms": {
                        "field": column.es_column,
                        "size": cardinality
                    }
                }

            result = self.es_cluster.post("/" + es_index + "/_search",
                                          data=query)

            aggs = result.aggregations._
            if aggs._nested:
                parts = jx.sort(aggs._nested.buckets.key)
            else:
                parts = jx.sort(aggs.buckets.key)

            DEBUG and Log.note(
                "update metadata for {{column.es_index}}.{{column.es_column}} (id={{id}}) at {{time}}",
                id=id(column),
                column=column,
                time=now)
            self.meta.columns.update({
                "set": {
                    "count": count,
                    "cardinality": cardinality,
                    "multi": multi,
                    "partitions": parts,
                    "last_updated": now
                },
                "where": {
                    "eq": {
                        "es_index": column.es_index,
                        "es_column": column.es_column
                    }
                }
            })
        except Exception as e:
            # CANNOT IMPORT: THE TEST MODULES SET UP LOGGING
            # from tests.test_jx import TEST_TABLE
            e = Except.wrap(e)
            TEST_TABLE = "testdata"
            is_missing_index = any(
                w in e for w in
                ["IndexMissingException", "index_not_found_exception"])
            is_test_table = column.es_index.startswith(
                (TEST_TABLE_PREFIX, TEST_TABLE))
            if is_missing_index:
                # WE EXPECT TEST TABLES TO DISAPPEAR
                Log.warning("Missing index {{col.es_index}}",
                            col=column,
                            cause=e)
                self.meta.columns.update({
                    "clear": ".",
                    "where": {
                        "eq": {
                            "es_index": column.es_index
                        }
                    }
                })
                self.index_does_not_exist.add(column.es_index)
            elif "No field found for" in e:
                self.meta.columns.update({
                    "clear": ".",
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
                Log.warning(
                    "Could not get column {{col.es_index}}.{{col.es_column}} info",
                    col=column,
                    cause=e)
            else:
                self.meta.columns.update({
                    "set": {
                        "last_updated": now
                    },
                    "clear": [
                        "count",
                        "cardinality",
                        "multi",
                        "partitions",
                    ],
                    "where": {
                        "eq": {
                            "es_index": column.es_index,
                            "es_column": column.es_column
                        }
                    }
                })
                Log.warning(
                    "Could not get {{col.es_index}}.{{col.es_column}} info",
                    col=column,
                    cause=e)

    def monitor(self, please_stop):
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            try:
                if not self.todo:
                    old_columns = [
                        c for c in self.meta.columns
                        if ((c.last_updated < Date.now() -
                             MAX_COLUMN_METADATA_AGE) or c.cardinality == None)
                        and c.jx_type not in STRUCT
                    ]
                    if old_columns:
                        DEBUG and Log.note(
                            "Old columns {{names|json}} last updated {{dates|json}}",
                            names=wrap(old_columns).es_column,
                            dates=[
                                Date(t).format()
                                for t in wrap(old_columns).last_updated
                            ])
                        self.todo.extend(old_columns)
                    else:
                        DEBUG and Log.note("no more metatdata to update")

                column = self.todo.pop(Till(seconds=(10 * MINUTE).seconds))
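                # (A pop THAT TIMES OUT AFTER 10 IDLE MINUTES RETURNS None,
                # SENDING THE LOOP BACK TO RESCAN FOR STALE COLUMNS)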
                if column:
                    if column is THREAD_STOP:
                        continue

                    with Timer("update {{table}}.{{column}}",
                               param={
                                   "table": column.es_index,
                                   "column": column.es_column
                               },
                               silent=not DEBUG):
                        if column.es_index in self.index_does_not_exist:
                            DEBUG and Log.note(
                                "{{column.es_column}} does not exist",
                                column=column)
                            self.meta.columns.update({
                                "clear": ".",
                                "where": {
                                    "eq": {
                                        "es_index": column.es_index
                                    }
                                }
                            })
                            continue
                        if column.jx_type in STRUCT or split_field(
                                column.es_column)[-1] == EXISTS_TYPE:
                            DEBUG and Log.note(
                                "{{column.es_column}} is a struct",
                                column=column)
                            column.last_updated = Date.now()
                            continue
                        elif column.last_updated > Date.now() - TOO_OLD and column.cardinality is not None:
                            # DO NOT UPDATE FRESH COLUMN METADATA
                            DEBUG and Log.note(
                                "{{column.es_column}} is still fresh ({{ago}} ago)",
                                column=column,
                                ago=(Date.now() -
                                     Date(column.last_updated)).seconds)
                            continue
                        try:
                            self._update_cardinality(column)
                            (DEBUG and
                             not column.es_index.startswith(TEST_TABLE_PREFIX)
                             ) and Log.note("updated {{column.name}}",
                                            column=column)
                        except Exception as e:
                            if '"status":404' in e:
                                self.meta.columns.update({
                                    "clear": ".",
                                    "where": {
                                        "eq": {
                                            "es_index": column.es_index,
                                            "es_column": column.es_column
                                        }
                                    }
                                })
                            else:
                                Log.warning(
                                    "problem getting cardinality for {{column.name}}",
                                    column=column,
                                    cause=e)
            except Exception as e:
                Log.warning("problem in cardinality monitor", cause=e)

    def not_monitor(self, please_stop):
        Log.alert("metadata scan has been disabled")
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            column = self.todo.pop()
            if column == THREAD_STOP:
                break

            if column.jx_type in STRUCT or split_field(
                    column.es_column)[-1] == EXISTS_TYPE:
                DEBUG and Log.note("{{column.es_column}} is a struct",
                                   column=column)
                column.last_updated = Date.now()
                continue
            elif column.last_updated > Date.now() - TOO_OLD and column.cardinality is not None:
                # DO NOT UPDATE FRESH COLUMN METADATA
                DEBUG and Log.note(
                    "{{column.es_column}} is still fresh ({{ago}} ago)",
                    column=column,
                    ago=(Date.now() - Date(column.last_updated)).seconds)
                continue

            with Timer("Update {{col.es_index}}.{{col.es_column}}",
                       param={"col": column},
                       silent=not DEBUG,
                       too_long=0.05):
                if untype_path(column.name) in ["build.type", "run.type"]:
                    try:
                        self._update_cardinality(column)
                    except Exception as e:
                        Log.warning(
                            "problem getting cardinality for {{column.name}}",
                            column=column,
                            cause=e)
                else:
                    column.last_updated = Date.now()

    def get_table(self, name):
        if name == "meta.columns":
            return self.meta.columns

        with self.meta.tables.locker:
            return wrap([t for t in self.meta.tables.data if t.name == name])

    def get_snowflake(self, fact_table_name):
        return Snowflake(fact_table_name, self)

    def get_schema(self, name):
        if name == "meta.columns":
            return self.meta.columns.schema
        if name == "meta.tables":
            return self.meta.tables
        root, rest = tail_field(name)
        return self.get_snowflake(root).get_schema(rest)
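
Worth isolating from the `_update_cardinality` logic above: a single ES request can carry both a `cardinality` aggregation (the distinct-value estimate) and a scripted `max` aggregation (the largest number of values any one document holds, the column's `multi`). A minimal standalone sketch, assuming the official `elasticsearch` Python client and mirroring the script string used above; `probe_column` and its parameters are illustrative names, not taken from this code:

from elasticsearch import Elasticsearch

def probe_column(es, index, field):
    # ONE ROUND TRIP: DISTINCT-VALUE ESTIMATE PLUS MAX VALUES-PER-DOCUMENT
    body = {
        "aggs": {
            "count": {"cardinality": {"field": field}},
            "multi": {"max": {"script": "doc['" + field + "'].values.size()"}}
        },
        "size": 0  # AGGREGATIONS ONLY; NO HITS NEEDED
    }
    result = es.search(index=index, body=body)
    aggs = result["aggregations"]
    return aggs["count"]["value"], int(aggs["multi"]["value"] or 1)

# USAGE (ASSUMED LOCAL CLUSTER):
# es = Elasticsearch("http://localhost:9200")
# cardinality, multi = probe_column(es, "unittest", "run.suite")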
Exemplo n.º 10
0
class Extract(object):
    @override
    def __init__(self, kwargs=None):
        self.settings = kwargs
        self.schema = SnowflakeSchema(self.settings.snowflake)
        self._extract = extract = kwargs.extract

        # SOME PREP
        get_git_revision()

        # VERIFY WE DO NOT HAVE TOO MANY OTHER PROCESSES WORKING ON STUFF
        with MySQL(**kwargs.snowflake.database) as db:
            processes = None
            try:
                processes = jx.filter(
                    db.query("show processlist"), {
                        "and": [{
                            "neq": {
                                "Command": "Sleep"
                            }
                        }, {
                            "neq": {
                                "Info": "show processlist"
                            }
                        }]
                    })
            except Exception as e:
                Log.warning("no database", cause=e)

            if processes:
                if DEBUG:
                    Log.warning("Processes are running\n{{list|json}}",
                                list=processes)
                else:
                    Log.error("Processes are running\n{{list|json}}",
                              list=processes)

        extract.type = listwrap(extract.type)
        extract.start = listwrap(extract.start)
        extract.batch = listwrap(extract.batch)
        extract.field = listwrap(extract.field)
        if any(
                len(extract.type) != len(other)
                for other in [extract.start, extract.batch, extract.field]):
            Log.error(
                "Expecting same number of dimensions for `type`, `start`, `batch`, and `field` in the `extract` inner object"
            )
        for i, t in enumerate(extract.type):
            if t == "time":
                extract.start[i] = Date(extract.start[i])
                extract.batch[i] = Duration(extract.batch[i])
            elif t == "number":
                pass
            else:
                Log.error('Expecting `extract.type` to be "number" or "time"')

        extract.threads = coalesce(extract.threads, 1)
        self.done_pulling = Signal()
        self.queue = Queue("all batches",
                           max=2 * coalesce(extract.threads, 1),
                           silent=True)

        self.bucket = s3.Bucket(self.settings.destination)
        self.notify = aws.Queue(self.settings.notify)
        Thread.run("get records", self.pull_all_remaining)

    def pull_all_remaining(self, please_stop):
        try:
            try:
                content = File(self.settings.extract.last).read_json()
                if len(content) == 1:
                    Log.note("Got a manually generated file {{filename}}",
                             filename=self.settings.extract.last)
                    start_point = tuple(content[0])
                    first_value = [
                        self._extract.start[0] + (start_point[0] * DAY),
                        start_point[1]
                    ]
                else:
                    Log.note("Got a machine generated file {{filename}}",
                             filename=self.settings.extract.last)
                    start_point, first_value = content
                    start_point = tuple(start_point)
                Log.note("First value is {{start1|date}}, {{start2}}",
                         start1=first_value[0],
                         start2=first_value[1])
            except Exception as _:
                Log.error(
                    "Expecting a file {{filename}} with the last good S3 bucket etl id in array form eg: [[954, 0]]",
                    filename=self.settings.extract.last)
                start_point = tuple(self._extract.start)
                first_value = Null

            counter = Counter(start=0)
            # list() SO reversed() ALSO WORKS ON PYTHON 3, WHERE zip() IS LAZY
            for t, s, b, f, i in reversed(
                    list(zip(self._extract.type, self._extract.start,
                             self._extract.batch,
                             listwrap(first_value) + DUMMY_LIST,
                             range(len(self._extract.start))))):
                if t == "time":
                    counter = DurationCounter(start=s,
                                              duration=b,
                                              child=counter)
                    first_value[i] = Date(f)
                else:
                    counter = BatchCounter(start=s, size=b, child=counter)

            batch_size = self._extract.batch.last() * 2 * self.settings.extract.threads
            with MySQL(**self.settings.snowflake.database) as db:
                while not please_stop:
                    sql = self._build_list_sql(db, first_value, batch_size + 1)
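                    # ONE EXTRA ROW REQUESTED, SO A FULL LAST BATCH STILL
                    # YIELDS A BOUNDARY KEY; count < batch_size BELOW MEANS
                    # THE TABLE IS EXHAUSTED (AS I READ IT)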
                    pending = []
                    counter.reset(start_point)
                    with Timer("Grab a block of ids for processing"):
                        with closing(db.db.cursor()) as cursor:
                            acc = []
                            cursor.execute(sql)
                            count = 0
                            for row in cursor:
                                detail_key = counter.next(row)
                                key = tuple(detail_key[:-1])
                                count += 1
                                if key != start_point:
                                    if first_value:
                                        if not acc:
                                            Log.error(
                                                "not expected, {{filename}} is probably set too far in the past",
                                                filename=self.settings.extract.last)
                                        pending.append({
                                            "start_point": start_point,
                                            "first_value": first_value,
                                            "data": acc
                                        })
                                    acc = []
                                    start_point = key
                                    first_value = row
                                acc.append(
                                    row[-1]
                                )  # ASSUME LAST COLUMN IS THE FACT TABLE id
                    Log.note("adding {{num}} for processing", num=len(pending))
                    self.queue.extend(pending)

                    if count < batch_size:
                        self.queue.add(THREAD_STOP)
                        break
        except Exception as e:
            Log.warning("Problem pulling data", cause=e)
        finally:
            self.done_pulling.go()
            Log.note("pulling new data is done")

    def _build_list_sql(self, db, first, batch_size):
        # TODO: ENSURE THE LAST COLUMN IS THE id
        if first:
            dim = len(self._extract.field)
            where = SQL_OR.join(
                sql_iso(
                    sql_and(
                        quote_column(f) + ineq(i, e, dim) +
                        db.quote_value(Date(v) if t == "time" else v)
                        for e, (f, v, t) in enumerate(
                            zip(self._extract.field[0:i + 1:], first,
                                self._extract.type[0:i + 1:]))))
                for i in range(dim))
        else:
            where = SQL_TRUE

        selects = []
        for t, f in zip(self._extract.type, self._extract.field):
            if t == "time":
                selects.append(
                    "CAST" +
                    sql_iso(sql_alias(quote_column(f), SQL("DATETIME(6)"))))
            else:
                selects.append(quote_column(f))
        sql = (SQL_SELECT + sql_list(selects) + SQL_FROM +
               self.settings.snowflake.fact_table + SQL_WHERE + where +
               SQL_ORDERBY +
               sql_list(quote_column(f) for f in self._extract.field) +
               SQL_LIMIT + db.quote_value(batch_size))
        return sql
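
        # NOTE: THE where ABOVE IS STANDARD KEYSET (TUPLE) PAGINATION; FOR
        # fields == ["day", "id"] AND first == ("2018-01-02", 954) IT HAS THE
        # SHAPE (AS I READ ineq; SKETCH ONLY, IGNORING QUOTING):
        #
        #     (day > '2018-01-02') OR (day = '2018-01-02' AND id > 954)
        #
        # EQUALITY ON THE PREFIX FIELDS, STRICTLY-GREATER ON THE NEXT ONE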

    def extract(self, db, start_point, first_value, data, please_stop):
        Log.note(
            "Starting scan of {{table}} at {{id}} and sending to batch {{start_point}}",
            table=self.settings.snowflake.fact_table,
            id=first_value,
            start_point=start_point)

        id = quote_column(self._extract.field.last())
        ids = (SQL_SELECT + id + SQL_FROM +
               self.settings.snowflake.fact_table + SQL_WHERE + id + " in " +
               sql_iso(sql_list(map(db.quote_value, data))))
        sql = self.schema.get_sql(ids)

        with Timer("Sending SQL"):
            cursor = db.query(sql, stream=True, row_tuples=True)

        extract = self.settings.extract
        fact_table = self.settings.snowflake.fact_table

        with TempFile() as temp_file:
            parent_etl = None
            for s in start_point:
                parent_etl = {"id": s, "source": parent_etl}
            parent_etl["revision"] = get_git_revision()
            parent_etl["machine"] = machine_metadata

            def append(value, i):
                """
                :param value: THE DOCUMENT TO ADD
                :return: PleaseStop
                """
                temp_file.append(
                    convert.value2json({
                        fact_table: elasticsearch.scrub(value),
                        "etl": {
                            "id": i,
                            "source": parent_etl,
                            "timestamp": Date.now()
                        }
                    }))

            with Timer("assemble data"):
                self.construct_docs(cursor, append, please_stop)

            # WRITE TO S3
            s3_file_name = ".".join(map(text_type, start_point))
            with Timer("write to destination {{filename}}",
                       param={"filename": s3_file_name}):
                if not isinstance(self.settings.destination, text_type):
                    destination = self.bucket.get_key(s3_file_name,
                                                      must_exist=False)
                    destination.write_lines(temp_file)
                else:
                    destination = File(self.settings.destination)
                    destination.write(
                        convert.value2json(
                            [convert.json2value(o) for o in temp_file],
                            pretty=True))
                    return False

        # NOTIFY SQS
        now = Date.now()
        self.notify.add({
            "bucket": self.settings.destination.bucket,
            "key": s3_file_name,
            "timestamp": now.unix,
            "date/time": now.format()
        })

        # SUCCESS!!
        File(extract.last).write(convert.value2json([start_point,
                                                     first_value]))
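
        # THE parent_etl LOOP ABOVE FOLDS THE BATCH KEY INTO A LINKED source
        # CHAIN, INNERMOST id FIRST; A WORKED TRACE FOR A HYPOTHETICAL
        # start_point OF (954, 3):
        #
        #     {"id": 3, "source": {"id": 954, "source": None}}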

    def construct_docs(self, cursor, append, please_stop):
        """
        :param cursor: ITERATOR OF RECORDS
        :param append: METHOD TO CALL WITH CONSTRUCTED DOCUMENT
        :return: (count, first, next, next_key)
        number of documents added
        the first document in the batch
        the first document of the next batch
        """
        null_values = set(self.settings.snowflake.null_values) | {None}

        count = 0
        rownum = 0
        columns = tuple(wrap(c) for c in self.schema.columns)
        with Timer("Downloading from MySQL"):
            curr_record = Null
            for rownum, row in enumerate(cursor):
                if please_stop:
                    Log.error("Got `please_stop` signal")

                nested_path = []
                next_record = None

                for c, value in zip(columns, row):
                    if value in null_values:
                        continue
                    if len(nested_path) < len(c.nested_path):
                        nested_path = unwrap(c.nested_path)
                        next_record = Data()
                    next_record[c.put] = value

                if len(nested_path) > 1:
                    path = nested_path[-2]
                    children = curr_record[path]
                    if children == None:
                        children = curr_record[path] = wrap([])
                    if len(nested_path) > 2:
                        parent_path = path
                        for path in list(reversed(nested_path[0:-2:])):
                            parent = children.last()
                            relative_path = relative_field(path, parent_path)
                            children = parent[relative_path]
                            if children == None:
                                children = parent[relative_path] = wrap([])
                            parent_path = path

                    children.append(next_record)
                    continue

                if curr_record == next_record:
                    Log.error("not expected")

                if curr_record:
                    append(curr_record["id"], count)
                    count += 1
                curr_record = next_record

            # DEAL WITH LAST RECORD
            if curr_record:
                append(curr_record["id"], count)
                count += 1

        Log.note("{{num}} documents ({{rownum}} db records)",
                 num=count,
                 rownum=rownum)
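
`construct_docs` depends on the cursor yielding rows parent-first in fact-table order: a row that only sets columns from a deeper `nested_path` attaches to the document under construction, and any other row closes the current document and starts the next. A toy version of that fold, with invented row shapes standing in for the real `nested_path`/`put` bookkeeping:

def fold(rows):
    # ROWS ARRIVE PARENT-FIRST; A ROW CARRYING ONLY CHILD FIELDS IS
    # APPENDED TO THE CURRENT PARENT, ANYTHING ELSE STARTS A NEW DOCUMENT
    docs, curr = [], None
    for row in rows:
        if "child" in row:
            curr.setdefault("children", []).append(row["child"])
        else:
            if curr is not None:
                docs.append(curr)
            curr = dict(row)
    if curr is not None:
        docs.append(curr)
    return docs

# fold([{"id": 1}, {"child": {"k": "x"}}, {"id": 2}])
# -> [{"id": 1, "children": [{"k": "x"}]}, {"id": 2}]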
Exemplo n.º 11
0
class FromESMetadata(Schema):
    """
    QUERY THE METADATA
    """

    def __new__(cls, *args, **kwargs):
        global singlton
        if singlton:
            return singlton
        else:
            singlton = object.__new__(cls)
            return singlton

    @override
    def __init__(self, host, index, alias=None, name=None, port=9200, kwargs=None):
        global _elasticsearch
        if hasattr(self, "settings"):
            return

        from pyLibrary.queries.containers.list_usingPythonList import ListContainer
        from pyLibrary.env import elasticsearch as _elasticsearch

        self.settings = kwargs
        self.default_name = coalesce(name, alias, index)
        self.default_es = _elasticsearch.Cluster(kwargs=kwargs)
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        self.es_metadata = Null
        self.last_es_metadata = Date.now()-OLD_METADATA

        self.meta=Data()
        table_columns = metadata_tables()
        column_columns = metadata_columns()
        self.meta.tables = ListContainer("meta.tables", [], wrap({c.name: c for c in table_columns}))
        self.meta.columns = ColumnList()
        self.meta.columns.insert(column_columns)
        self.meta.columns.insert(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("refresh metadata", self.not_monitor)
        return

    @property
    def query_path(self):
        return None

    @property
    def url(self):
        return self.default_es.path + "/" + self.default_name.replace(".", "/")

    def get_table(self, table_name):
        with self.meta.tables.locker:
            return wrap([t for t in self.meta.tables.data if t.name == table_name])

    def _upsert_column(self, c):
        # ASSUMES THE CALLER ALREADY HOLDS self.meta.columns.locker
        existing_columns = self.meta.columns.find(c.table, c.name)
        if not existing_columns:
            self.meta.columns.add(c)
            self.todo.add(c)

            if ENABLE_META_SCAN:
                if DEBUG:
                    Log.note("todo: {{table}}::{{column}}", table=c.table, column=c.es_column)
                # MARK meta.columns AS DIRTY TOO
                cols = self.meta.columns.find("meta.columns", None)
                for cc in cols:
                    cc.partitions = cc.cardinality = None
                    cc.last_updated = Date.now()
                self.todo.extend(cols)
        else:
            canonical = existing_columns[0]
            if canonical.relative and not c.relative:
                return  # RELATIVE COLUMNS WILL SHADOW ABSOLUTE COLUMNS

            for key in Column.__slots__:
                canonical[key] = c[key]
            if DEBUG:
                Log.note("todo: {{table}}::{{column}}", table=canonical.table, column=canonical.es_column)
            self.todo.add(canonical)

    def _get_columns(self, table=None):
        # TODO: HANDLE MORE THAN ONE ES, MAP TABLE SHORT_NAME TO ES INSTANCE
        meta = self.es_metadata.indices[table]
        if not meta or self.last_es_metadata < Date.now() - OLD_METADATA:
            self.es_metadata = self.default_es.get_metadata(force=True)
            meta = self.es_metadata.indices[table]
        self._parse_properties(meta.index, Data(properties={"_id": {"type": "string", "index": "not_analyzed"}}), meta)
        for _, properties in meta.mappings.items():
            self._parse_properties(meta.index, properties, meta)

    def _parse_properties(self, abs_index, properties, meta):
        abs_columns = _elasticsearch.parse_properties(abs_index, None, properties.properties)
        abs_columns = abs_columns.filter(  # TODO: REMOVE WHEN jobs PROPERTY EXPLOSION IS CONTAINED
            lambda r: not r.es_column.startswith("other.") and
                      not r.es_column.startswith("previous_values.cf_") and
                      not r.es_index.startswith("debug") and
                      r.es_column.find("=")==-1 and
                      r.es_column.find(" ")==-1
        )
        with Timer("upserting {{num}} columns", {"num": len(abs_columns)}, debug=DEBUG):
            def add_column(c, query_path):
                c.last_updated = Date.now()
                c.table = join_field([c.es_index]+split_field(query_path[0]))

                with self.meta.columns.locker:
                    self._upsert_column(c)
                    for alias in meta.aliases:
                        c = copy(c)
                        c.table = join_field([alias]+split_field(query_path[0]))
                        self._upsert_column(c)

            # LIST OF EVERY NESTED PATH
            query_paths = [[c.es_column] for c in abs_columns if c.type == "nested"]
            for a, b in itertools.product(query_paths, query_paths):
                aa = a[0]
                bb = b[0]
                if aa and bb.startswith(aa):
                    for i, b_prefix in enumerate(b):
                        if len(b_prefix) > len(aa):
                            continue
                        if aa == b_prefix:
                            break  # SPLIT ALREADY FOUND
                        b.insert(i, aa)
                        break
            for q in query_paths:
                q.append(".")
            query_paths.append(ROOT_PATH)

            # ADD RELATIVE COLUMNS
            for abs_column in abs_columns:
                full_path = abs_column.nested_path
                abs_depth = len(full_path)-1
                abs_parent = full_path[1] if abs_depth else ""

                for query_path in query_paths:
                    rel_depth = len(query_path)-1
                    rel_parent = query_path[0]
                    rel_column = copy(abs_column)
                    rel_column.relative = True

                    add_column(copy(abs_column), query_path)

                    if rel_parent == ".":
                        add_column(rel_column, query_path)
                    elif abs_column.es_column.startswith(rel_parent+"."):
                        rel_column.name = abs_column.es_column[len(rel_parent)+1:]
                        add_column(rel_column, query_path)
                    elif abs_column.es_column == rel_parent:
                        rel_column.name = "."
                        add_column(rel_column, query_path)
                    elif not abs_parent:
                        # THIS RELATIVE NAME (..o) ALSO NEEDS A RELATIVE NAME (o)
                        # AND THEN REMOVE THE SHADOWED
                        rel_column.name = "." + ("." * (rel_depth - abs_depth)) + abs_column.es_column
                        add_column(rel_column, query_path)
                    elif rel_parent.startswith(abs_parent+"."):
                        rel_column.name = "." + ("." * (rel_depth - abs_depth)) + abs_column.es_column
                        add_column(rel_column, query_path)
                    elif rel_parent != abs_parent:
                        # SIBLING NESTED PATHS ARE INVISIBLE
                        pass
                    else:
                        Log.error("logic error")

    def query(self, _query):
        return self.meta.columns.query(QueryOp(set_default(
            {
                "from": self.meta.columns,
                "sort": ["table", "name"]
            },
            _query.__data__()
        )))

    def get_columns(self, table_name, column_name=None, force=False):
        """
        RETURN METADATA COLUMNS
        """
        try:
            # LAST TIME WE GOT INFO FOR THIS TABLE
            short_name = join_field(split_field(table_name)[0:1])
            table = self.get_table(short_name)[0]

            if not table:
                table = Table(
                    name=short_name,
                    url=None,
                    query_path=None,
                    timestamp=Date.now()
                )
                with self.meta.tables.locker:
                    self.meta.tables.add(table)
                self._get_columns(table=short_name)
            elif force or table.timestamp == None or table.timestamp < Date.now() - MAX_COLUMN_METADATA_AGE:
                table.timestamp = Date.now()
                self._get_columns(table=short_name)

            with self.meta.columns.locker:
                columns = self.meta.columns.find(table_name, column_name)
            if columns:
                columns = jx.sort(columns, "name")
                # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
                while len(self.todo) and not all(columns.get("last_updated")):
                    if DEBUG:
                        Log.note("waiting for columns to update {{columns|json}}", columns=[c.table+"."+c.es_column for c in columns if not c.last_updated])
                    Till(seconds=1).wait()
                return columns
        except Exception as e:
            Log.error("Not expected", cause=e)

        if column_name:
            Log.error("no columns matching {{table}}.{{column}}", table=table_name, column=column_name)
        else:
            self._get_columns(table=table_name)
            Log.error("no columns for {{table}}?!", table=table_name)
Exemplo n.º 12
0
class FromESMetadata(Schema):
    """
    QUERY THE METADATA
    """

    def __new__(cls, *args, **kwargs):
        global singlton
        if singlton:
            return singlton
        else:
            singlton = object.__new__(cls)
            return singlton

    @override
    def __init__(self, host, index, alias=None, name=None, port=9200, kwargs=None):
        global _elasticsearch
        if hasattr(self, "settings"):
            return

        from pyLibrary.queries.containers.list_usingPythonList import ListContainer
        from pyLibrary.env import elasticsearch as _elasticsearch

        self.settings = kwargs
        self.default_name = coalesce(name, alias, index)
        self.default_es = _elasticsearch.Cluster(kwargs=kwargs)
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        self.es_metadata = Null
        self.last_es_metadata = Date.now()-OLD_METADATA

        self.meta=Data()
        table_columns = metadata_tables()
        column_columns = metadata_columns()
        self.meta.tables = ListContainer("meta.tables", [], wrap({c.names["."]: c for c in table_columns}))
        self.meta.columns = ColumnList()
        self.meta.columns.insert(column_columns)
        self.meta.columns.insert(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("refresh metadata", self.not_monitor)
        return

    @property
    def query_path(self):
        return None

    @property
    def url(self):
        return self.default_es.path + "/" + self.default_name.replace(".", "/")

    def get_table(self, table_name):
        with self.meta.tables.locker:
            return wrap([t for t in self.meta.tables.data if t.name == table_name])

    def _upsert_column(self, c):
        # ASSUMES THE CALLER ALREADY HOLDS self.meta.columns.locker
        existing_columns = self.meta.columns.find(c.es_index, c.names["."])
        if not existing_columns:
            self.meta.columns.add(c)
            self.todo.add(c)

            if ENABLE_META_SCAN:
                if DEBUG:
                    Log.note("todo: {{table}}::{{column}}", table=c.es_index, column=c.es_column)
                # MARK meta.columns AS DIRTY TOO
                cols = self.meta.columns.find("meta.columns", None)
                for cc in cols:
                    cc.partitions = cc.cardinality = None
                    cc.last_updated = Date.now()
                self.todo.extend(cols)
        else:
            canonical = existing_columns[0]
            if canonical is not c:
                set_default(c.names, canonical.names)
                for key in Column.__slots__:
                    canonical[key] = c[key]
            if DEBUG:
                Log.note("todo: {{table}}::{{column}}", table=canonical.es_index, column=canonical.es_column)
            self.todo.add(canonical)

    def _get_columns(self, table=None):
        # TODO: HANDLE MORE THAN ONE ES, MAP TABLE SHORT_NAME TO ES INSTANCE
        table_path = split_field(table)
        es_index = table_path[0]
        query_path = join_field(table_path[1:])
        meta = self.es_metadata.indices[es_index]
        if not meta or self.last_es_metadata < Date.now() - OLD_METADATA:
            self.es_metadata = self.default_es.get_metadata(force=True)
            meta = self.es_metadata.indices[es_index]

        for _, properties in meta.mappings.items():
            properties.properties["_id"] = {"type": "string", "index": "not_analyzed"}
            self._parse_properties(meta.index, properties, meta)

    def _parse_properties(self, abs_index, properties, meta):
        # IT IS IMPORTANT THAT NESTED PROPERTIES NAME ALL COLUMNS, AND
        # ALL COLUMNS ARE GIVEN NAMES FOR ALL NESTED PROPERTIES
        abs_columns = _elasticsearch.parse_properties(abs_index, None, properties.properties)
        abs_columns = abs_columns.filter(  # TODO: REMOVE WHEN jobs PROPERTY EXPLOSION IS CONTAINED
            lambda r: not r.es_column.startswith("other.") and
                      not r.es_column.startswith("previous_values.cf_") and
                      not r.es_index.startswith("debug") and
                      r.es_column.find("=") == -1 and
                      r.es_column.find(" ") == -1
        )

        def add_column(c, query_path):
            c.last_updated = Date.now()
            if query_path[0] != ".":
                c.names[query_path[0]] = relative_field(c.names["."], query_path[0])

            with self.meta.columns.locker:
                self._upsert_column(c)
                for alias in meta.aliases:
                    c = copy(c)
                    c.es_index = alias
                    self._upsert_column(c)

        with Timer("upserting {{num}} columns", {"num": len(abs_columns)}, debug=DEBUG):
            # LIST OF EVERY NESTED PATH
            query_paths = [[c.es_column] for c in abs_columns if c.type == "nested"]
            for a, b in itertools.product(query_paths, query_paths):
                aa = a[0]
                bb = b[0]
                if aa and bb.startswith(aa):
                    for i, b_prefix in enumerate(b):
                        if len(b_prefix) > len(aa):
                            continue
                        if aa == b_prefix:
                            break  # SPLIT ALREADY FOUND
                        b.insert(i, aa)
                        break
            for q in query_paths:
                q.append(".")
            query_paths.append(ROOT_PATH)

            # ADD RELATIVE COLUMNS
            for abs_column in abs_columns:
                for query_path in query_paths:
                    add_column(abs_column, query_path)

    def query(self, _query):
        return self.meta.columns.query(QueryOp(set_default(
            {
                "from": self.meta.columns,
                "sort": ["table", "name"]
            },
            _query.__data__()
        )))

    def get_columns(self, table_name, column_name=None, force=False):
        """
        RETURN METADATA COLUMNS
        """
        table_path = split_field(table_name)
        es_index_name = table_path[0]
        query_path = join_field(table_path[1:])
        table = self.get_table(es_index_name)[0]
        abs_column_name = None if column_name == None else concat_field(query_path, column_name)

        try:
            # LAST TIME WE GOT INFO FOR THIS TABLE
            if not table:
                table = Table(
                    name=es_index_name,
                    url=None,
                    query_path=None,
                    timestamp=Date.now()
                )
                with self.meta.tables.locker:
                    self.meta.tables.add(table)
                self._get_columns(table=es_index_name)
            elif force or table.timestamp == None or table.timestamp < Date.now() - MAX_COLUMN_METADATA_AGE:
                table.timestamp = Date.now()
                self._get_columns(table=es_index_name)

            with self.meta.columns.locker:
                columns = self.meta.columns.find(es_index_name, column_name)
            if columns:
                columns = jx.sort(columns, "names.\.")
                # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
                while len(self.todo) and not all(columns.get("last_updated")):
                    if DEBUG:
                        Log.note("waiting for columns to update {{columns|json}}", columns=[c.es_index+"."+c.es_column for c in columns if not c.last_updated])
                    Till(seconds=1).wait()
                return columns
        except Exception as e:
            Log.error("Not expected", cause=e)

        if abs_column_name:
            Log.error("no columns matching {{table}}.{{column}}", table=table_name, column=abs_column_name)
        else:
            self._get_columns(table=table_name)  # TO TEST WHAT HAPPENED
            Log.error("no columns for {{table}}?!", table=table_name)

    def _update_cardinality(self, c):
        """
        QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN
        """
        if c.type in STRUCT:
            Log.error("not supported")
        try:
            if c.es_index == "meta.columns":
                with self.meta.columns.locker:
                    partitions = jx.sort([g[c.es_column] for g, _ in jx.groupby(self.meta.columns, c.es_column) if g[c.es_column] != None])
                    self.meta.columns.update({
                        "set": {
                            "partitions": partitions,
                            "count": len(self.meta.columns),
                            "cardinality": len(partitions),
                            "last_updated": Date.now()
                        },
                        "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                    })
                return
            if c.es_index == "meta.tables":
                with self.meta.columns.locker:
                    partitions = jx.sort([g[c.es_column] for g, _ in jx.groupby(self.meta.tables, c.es_column) if g[c.es_column] != None])
                    self.meta.columns.update({
                        "set": {
                            "partitions": partitions,
                            "count": len(self.meta.tables),
                            "cardinality": len(partitions),
                            "last_updated": Date.now()
                        },
                        "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                    })
                return

            es_index = c.es_index.split(".")[0]
            result = self.default_es.post("/" + es_index + "/_search", data={
                "aggs": {c.names["."]: _counting_query(c)},
                "size": 0
            })
            r = result.aggregations.values()[0]
            count = result.hits.total
            cardinality = coalesce(r.value, r._nested.value, 0 if r.doc_count==0 else None)
            if cardinality == None:
                Log.error("logic error")

            query = Data(size=0)
            if cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99):
                if DEBUG:
                    Log.note("{{table}}.{{field}} has {{num}} parts", table=c.es_index, field=c.es_column, num=cardinality)
                with self.meta.columns.locker:
                    self.meta.columns.update({
                        "set": {
                            "count": count,
                            "cardinality": cardinality,
                            "last_updated": Date.now()
                        },
                        "clear": ["partitions"],
                        "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                    })
                return
            elif c.type in _elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
                if DEBUG:
                    Log.note("{{field}} has {{num}} parts", field=c.name, num=cardinality)
                with self.meta.columns.locker:
                    self.meta.columns.update({
                        "set": {
                            "count": count,
                            "cardinality": cardinality,
                            "last_updated": Date.now()
                        },
                        "clear": ["partitions"],
                        "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                    })
                return
            elif len(c.nested_path) != 1:
                query.aggs[literal_field(c.names["."])] = {
                    "nested": {"path": c.nested_path[0]},
                    "aggs": {"_nested": {"terms": {"field": c.es_column, "size": 0}}}
                }
            else:
                query.aggs[literal_field(c.names["."])] = {"terms": {"field": c.es_column, "size": 0}}

            result = self.default_es.post("/" + es_index + "/_search", data=query)

            aggs = result.aggregations.values()[0]
            if aggs._nested:
                parts = jx.sort(aggs._nested.buckets.key)
            else:
                parts = jx.sort(aggs.buckets.key)

            if DEBUG:
                Log.note("{{field}} has {{parts}}", field=c.name, parts=parts)
            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "partitions": parts,
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                })
        except Exception as e:
            if "IndexMissingException" in e and c.es_index.startswith(TEST_TABLE_PREFIX):
                with self.meta.columns.locker:
                    self.meta.columns.update({
                        "set": {
                            "count": 0,
                            "cardinality": 0,
                            "last_updated": Date.now()
                        },
                        "clear":[
                            "partitions"
                        ],
                        "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                    })
            else:
                self.meta.columns.update({
                    "set": {
                        "last_updated": Date.now()
                    },
                    "clear": [
                        "count",
                        "cardinality",
                        "partitions",
                    ],
                    "where": {"eq": {"names.\\.": ".", "es_index": c.es_index, "es_column": c.es_column}}
                })
                Log.warning("Could not get {{col.es_index}}.{{col.es_column}} info", col=c, cause=e)

    def monitor(self, please_stop):
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            try:
                if not self.todo:
                    with self.meta.columns.locker:
                        old_columns = [
                            c
                            for c in self.meta.columns
                            if (c.last_updated == None or c.last_updated < Date.now()-TOO_OLD) and c.type not in STRUCT
                        ]
                        if old_columns:
                            if DEBUG:
                                Log.note("Old columns wth dates {{dates|json}}", dates=wrap(old_columns).last_updated)
                            self.todo.extend(old_columns)
                            # TEST CONSISTENCY
                            for c, d in product(list(self.todo.queue), list(self.todo.queue)):
                                if c.es_column == d.es_column and c.es_index == d.es_index and c != d:
                                    Log.error("")
                        else:
                            if DEBUG:
                                Log.note("no more metatdata to update")

                column = self.todo.pop(Till(seconds=(10*MINUTE).seconds))
                if column:
                    if DEBUG:
                        Log.note("update {{table}}.{{column}}", table=column.es_index, column=column.es_column)
                    if column.type in STRUCT:
                        with self.meta.columns.locker:
                            column.last_updated = Date.now()
                        continue
                    elif column.last_updated >= Date.now()-TOO_OLD:
                        continue
                    try:
                        self._update_cardinality(column)
                        if DEBUG and not column.es_index.startswith(TEST_TABLE_PREFIX):
                            Log.note("updated {{column.name}}", column=column)
                    except Exception as e:
                        Log.warning("problem getting cardinality for {{column.name}}", column=column, cause=e)
            except Exception as e:
                Log.warning("problem in cardinality monitor", cause=e)

    def not_monitor(self, please_stop):
        Log.alert("metadata scan has been disabled")
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            c = self.todo.pop()
            if c == THREAD_STOP:
                break

            if not c.last_updated or c.last_updated >= Date.now()-TOO_OLD:
                continue

            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "last_updated": Date.now()
                    },
                    "clear":[
                        "count",
                        "cardinality",
                        "partitions",
                    ],
                    "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                })
            if DEBUG:
                Log.note("Could not get {{col.es_index}}.{{col.es_column}} info", col=c)
Exemplo n.º 13
0
class ElasticsearchMetadata(Namespace):
    """
    MANAGE SNOWFLAKE SCHEMAS FOR EACH OF THE ALIASES FOUND IN THE CLUSTER
    """

    @override
    def __new__(cls, kwargs, *args, **_kwargs):
        es_cluster = elasticsearch.Cluster(kwargs)
        output = known_clusters.get(id(es_cluster))
        if output is None:
            output = object.__new__(cls)
            known_clusters[id(es_cluster)] = output
        return output

    @override
    def __init__(self, host, index, sql_file='metadata.sqlite', alias=None, name=None, port=9200, kwargs=None):
        if hasattr(self, "settings"):
            return

        self.too_old = TOO_OLD
        self.settings = kwargs
        self.default_name = coalesce(name, alias, index)
        self.es_cluster = elasticsearch.Cluster(kwargs=kwargs)

        self.index_does_not_exist = set()
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        self.index_to_alias = {}

        self.es_metadata = Null
        self.metadata_last_updated = Date.now() - OLD_METADATA

        self.meta = Data()
        self.meta.columns = ColumnList(URL(self.es_cluster.settings.host).host)

        self.alias_to_query_paths = {
            "meta.columns": [ROOT_PATH],
            "meta.tables": [ROOT_PATH]
        }
        self.alias_last_updated = {
            "meta.columns": Date.now(),
            "meta.tables": Date.now()
        }
        table_columns = metadata_tables()
        self.meta.tables = ListContainer(
            "meta.tables",
            [],
            jx_base.Schema(".", table_columns)
        )
        self.meta.columns.extend(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("not refresh metadata", self.not_monitor)
        return

    @property
    def namespace(self):
        return self.meta.columns.namespace

    @property
    def url(self):
        return self.es_cluster.url / self.default_name.replace(".", "/")

    def _reload_columns(self, table_desc):
        """
        :param alias: A REAL ALIAS (OR NAME OF INDEX THAT HAS NO ALIAS)
        :return:
        """
        # FIND ALL INDEXES OF ALIAS
        es_last_updated = self.es_cluster.metatdata_last_updated

        alias = table_desc.name
        canonical_index = self.es_cluster.get_best_matching_index(alias).index
        es_metadata_update_required = not (table_desc.timestamp < es_last_updated)
        metadata = self.es_cluster.get_metadata(force=es_metadata_update_required)

        props = [
            (self.es_cluster.get_index(index=i, type=t, debug=DEBUG), t, m.properties)
            for i, d in metadata.indices.items()
            if alias in d.aliases
            for t, m in [_get_best_type_from_mapping(d.mappings)]
        ]

        # CONFIRM ALL COLUMNS ARE SAME, FIX IF NOT
        dirty = False
        all_comparisons = list(jx.pairwise(props)) + list(jx.pairwise(jx.reverse(props)))
        # NOTICE THE SAME (index, type, properties) TRIPLE FROM ABOVE
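        # PAIRWISE IN BOTH DIRECTIONS, SO EVERY INDEX IS COMPARED AGAINST ITS
        # NEIGHBOURS EACH WAY AND PICKS UP ANY PROPERTIES IT IS MISSING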
        for (i1, t1, p1), (i2, t2, p2) in all_comparisons:
            diff = elasticsearch.diff_schema(p2, p1)
            if not self.settings.read_only:
                for d in diff:
                    dirty = True
                    i1.add_property(*d)
        meta = self.es_cluster.get_metadata(force=dirty).indices[canonical_index]

        data_type, mapping = _get_best_type_from_mapping(meta.mappings)
        mapping.properties["_id"] = {"type": "string", "index": "not_analyzed"}
        columns = self._parse_properties(alias, mapping)
        table_desc.timestamp = es_last_updated
        return columns

    def _parse_properties(self, alias, mapping):
        abs_columns = elasticsearch.parse_properties(alias, ".", ROOT_PATH, mapping.properties)
        if DEBUG and any(c.cardinality == 0 and c.name != '_id' for c in abs_columns):
            Log.warning(
                "Some columns are not stored in {{url}} {{index|quote}} table:\n{{names}}",
                url=self.es_cluster.url,
                index=alias,
                names=[
                    ".".join((c.es_index, c.name))
                    for c in abs_columns
                    if c.cardinality == 0
                ]
            )

        with Timer("upserting {{num}} columns", {"num": len(abs_columns)}, silent=not DEBUG):
            # LIST OF EVERY NESTED PATH
            query_paths = [[c.es_column] for c in abs_columns if c.es_type == "nested"]
            for a, b in itertools.product(query_paths, query_paths):
                aa = a[0]
                bb = b[0]
                if aa and bb.startswith(aa):
                    for i, b_prefix in enumerate(b):
                        if len(b_prefix) > len(aa):
                            continue
                        if aa == b_prefix:
                            break  # SPLIT ALREADY FOUND
                        b.insert(i, aa)
                        break
            for q in query_paths:
                q.append(".")
            query_paths.append(ROOT_PATH)

            # ENSURE ALL TABLES HAVE THE QUERY PATHS SET
            self.alias_to_query_paths[alias] = query_paths
            for i, a in self.index_to_alias.items():
                if a == alias:
                    self.alias_to_query_paths[i] = query_paths

            # ENSURE COLUMN HAS CORRECT jx_type
            # PICK DEEPEST NESTED PROPERTY AS REPRESENTATIVE
            output = []
            best = {}
            for abs_column in abs_columns:
                abs_column.jx_type = jx_type(abs_column)
                if abs_column.jx_type not in STRUCT:
                    clean_name = unnest_path(abs_column.name)
                    other = best.get(clean_name)
                    if other:
                        if len(other.nested_path) < len(abs_column.nested_path):
                            output.remove(other)
                            self.meta.columns.update({"clear": ".", "where": {"eq": {"es_column": other.es_column, "es_index": other.es_index}}})
                        else:
                            continue
                    best[clean_name] = abs_column
                output.append(abs_column)

            # REGISTER ALL COLUMNS
            canonicals = []
            for abs_column in output:
                canonical = self.meta.columns.add(abs_column)
                canonicals.append(canonical)

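            # QUEUE COLUMNS FOR THE BACKGROUND WORKER TO MEASURE CARDINALITY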
            self.todo.extend(canonicals)
            return canonicals

    def query(self, _query):
        return self.meta.columns.query(QueryOp(set_default(
            {
                "from": self.meta.columns,
                "sort": ["table", "name"]
            },
            _query.__data__()
        )))

    def _find_alias(self, name):
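        # REFRESH THE index->alias MAP IF CLUSTER METADATA IS NEWER THAN OUR SNAPSHOT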
        if self.metadata_last_updated < self.es_cluster.metatdata_last_updated:
            for a in self.es_cluster.get_aliases():
                self.index_to_alias[a.index] = coalesce(a.alias, a.index)
                self.alias_last_updated.setdefault(a.alias, Date.MIN)
        if name in self.alias_last_updated:
            return name
        else:
            return self.index_to_alias.get(name)

    def get_columns(self, table_name, column_name=None, after=None, timeout=None):
        """
        RETURN METADATA COLUMNS

        :param table_name: TABLE WE WANT COLUMNS FOR
        :param column_name:  OPTIONAL NAME, IF INTERESTED IN ONLY ONE COLUMN
        :param after: FORCE LOAD, WAITING FOR last_updated TO BE AFTER THIS TIME
        :param timeout: Signal; GIVE UP WAITING WHEN IT GOES TRUE
        :return:
        """
        DEBUG and after and Log.note("getting columns for after {{time}}", time=after)
        table_path = split_field(table_name)
        root_table_name = table_path[0]

        alias = self._find_alias(root_table_name)
        if not alias:
            self.es_cluster.get_metadata(force=True)
            alias = self._find_alias(root_table_name)
            if not alias:
                Log.error("{{table|quote}} does not exist", table=table_name)

        try:
            table = self.get_table(alias)[0]
            # LAST TIME WE GOT INFO FOR THIS TABLE
            if not table:
                table = TableDesc(
                    name=alias,
                    url=None,
                    query_path=["."],
                    timestamp=Date.MIN
                )
                with self.meta.tables.locker:
                    self.meta.tables.add(table)
                columns = self._reload_columns(table)
                DEBUG and Log.note("columns from reload")
            elif after or table.timestamp < self.es_cluster.metatdata_last_updated:
                columns = self._reload_columns(table)
                DEBUG and Log.note("columns from reload")
            else:
                columns = self.meta.columns.find(alias, column_name)
                DEBUG and Log.note("columns from find()")

            DEBUG and Log.note("columns are {{ids}}", ids=[id(c) for c in columns])

            columns = jx.sort(columns, "name")

            if after is None:
                return columns  # DO NOT WAIT FOR COMPLETE COLUMNS

            # WAIT FOR THE COLUMNS TO UPDATE
            while True:
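                # A COLUMN IS PENDING IF NOT UPDATED SINCE after, OR ITS CARDINALITY IS STILL UNKNOWN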
                pending = [c for c in columns if after >= c.last_updated or (c.cardinality == None and c.jx_type not in STRUCT)]
                if not pending:
                    break
                if timeout:
                    Log.error("trying to get columns timed out")
                if DEBUG:
                    if len(pending) > 10:
                        Log.note("waiting for {{num}} columns to update by {{timestamp}}", num=len(pending), timestamp=after)
                    else:
                        Log.note("waiting for columns to update by {{timestamp}}; {{columns|json}}", timestamp=after, columns=[c.es_index + "." + c.es_column + " id="+text_type(id(c)) for c in pending])
                Till(seconds=1).wait()
            return columns
        except Exception as e:
            Log.error("Failure to get columns for {{table}}", table=table_name, cause=e)

        return []

    def _update_cardinality(self, column):
        """
        QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN
        """
        now = Date.now()
        if column.es_index in self.index_does_not_exist:
            return

        if column.jx_type in STRUCT:
            Log.error("not supported")
        try:
            if column.es_index == "meta.columns":
                partitions = jx.sort([g[column.es_column] for g, _ in jx.groupby(self.meta.columns, column.es_column) if g[column.es_column] != None])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.columns),
                        "cardinality": len(partitions),
                        "multi": 1,
                        "last_updated": now
                    },
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            if column.es_index == "meta.tables":
                partitions = jx.sort([g[column.es_column] for g, _ in jx.groupby(self.meta.tables, column.es_column) if g[column.es_column] != None])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.tables),
                        "cardinality": len(partitions),
                        "multi": 1,
                        "last_updated": now
                    },
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return

            es_index = column.es_index.split(".")[0]

            is_text = [cc for cc in self.meta.columns if cc.es_column == column.es_column and cc.es_type == "text"]
            if is_text:
                # text IS A MULTIVALUE STRING THAT CAN ONLY BE FILTERED
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "aggs": {
                        "count": {"filter": {"match_all": {}}}
                    },
                    "size": 0
                })
                count = result.hits.total
                cardinality = max(1001, count)
                multi = 1001
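                # 1001 APPEARS TO ACT AS A SENTINEL: ANYTHING OVER 1000 IS "TOO MANY PARTS TO ENUMERATE" (SEE BELOW)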
            elif column.es_column == "_id":
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "query": {"match_all": {}},
                    "size": 0
                })
                count = cardinality = result.hits.total
                multi = 1
            elif column.es_type == BOOLEAN:
                result = self.es_cluster.post("/" + es_index + "/_search", data={
                    "aggs": {
                        "count": _counting_query(column)
                    },
                    "size": 0
                })
                count = result.hits.total
                cardinality = 2

                DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "partitions": [False, True],
                        "multi": 1,
                        "last_updated": now
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            else:
                es_query = {
                    "aggs": {
                        "count": _counting_query(column),
                        "_filter": {
                            "aggs": {"multi": {"max": {"script": "doc[" + quote(column.es_column) + "].values.size()"}}},
                            "filter": {"bool": {"should": [
                                {"range": {"etl.timestamp.~n~": {"gte": (Date.today() - WEEK)}}},
                                {"bool": {"must_not": {"exists": {"field": "etl.timestamp.~n~"}}}}
                            ]}}
                        }
                    },
                    "size": 0
                }

                result = self.es_cluster.post("/" + es_index + "/_search", data=es_query)
                agg_results = result.aggregations
                count = result.hits.total
                cardinality = coalesce(agg_results.count.value, agg_results.count._nested.value, agg_results.count.doc_count)
                multi = int(coalesce(agg_results._filter.multi.value, 1))
                if cardinality == None:
                    Log.error("logic error")

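            # BUILD THE QUERY THAT COLLECTS THE ACTUAL PARTITIONS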
            query = Data(size=0)

            if column.es_column == "_id":
                self.meta.columns.update({
                    "set": {
                        "count": cardinality,
                        "cardinality": cardinality,
                        "multi": 1,
                        "last_updated": now
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
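            # TOO MANY DISTINCT VALUES: RECORD THE COUNTS, BUT DO NOT COLLECT PARTITIONS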
            elif cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99):
                DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "last_updated": now
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
            elif column.es_type in elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
                DEBUG and Log.note("{{table}}.{{field}} has {{num}} parts", table=column.es_index, field=column.es_column, num=cardinality)
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "multi": multi,
                        "last_updated": now
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                return
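            # NESTED COLUMNS REQUIRE A nested AGGREGATION TO REACH THEIR VALUES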
            elif len(column.nested_path) != 1:
                query.aggs["_"] = {
                    "nested": {"path": column.nested_path[0]},
                    "aggs": {"_nested": {"terms": {"field": column.es_column}}}
                }
            elif cardinality == 0:  # WHEN DOES THIS HAPPEN?
                query.aggs["_"] = {"terms": {"field": column.es_column}}
            else:
                query.aggs["_"] = {"terms": {"field": column.es_column, "size": cardinality}}

            result = self.es_cluster.post("/" + es_index + "/_search", data=query)

            aggs = result.aggregations._
            if aggs._nested:
                parts = jx.sort(aggs._nested.buckets.key)
            else:
                parts = jx.sort(aggs.buckets.key)

            DEBUG and Log.note("update metadata for {{column.es_index}}.{{column.es_column}} (id={{id}}) at {{time}}", id=id(column), column=column, time=now)
            self.meta.columns.update({
                "set": {
                    "count": count,
                    "cardinality": cardinality,
                    "multi": multi,
                    "partitions": parts,
                    "last_updated": now
                },
                "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
            })
        except Exception as e:
            # CAN NOT IMPORT: THE TEST MODULE SETS UP LOGGING
            # from tests.test_jx import TEST_TABLE
            e = Except.wrap(e)
            TEST_TABLE = "testdata"
            is_missing_index = any(w in e for w in ["IndexMissingException", "index_not_found_exception"])
            is_test_table = column.es_index.startswith((TEST_TABLE_PREFIX, TEST_TABLE))
            if is_missing_index:
                # WE EXPECT TEST TABLES TO DISAPPEAR
                Log.warning("Missing index {{col.es_index}}", col=column, cause=e)
                self.meta.columns.update({
                    "clear": ".",
                    "where": {"eq": {"es_index": column.es_index}}
                })
                self.index_does_not_exist.add(column.es_index)
            elif "No field found for" in e:
                self.meta.columns.update({
                    "clear": ".",
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                Log.warning("Could not get column {{col.es_index}}.{{col.es_column}} info", col=column, cause=e)
            else:
                self.meta.columns.update({
                    "set": {
                        "last_updated": now
                    },
                    "clear": [
                        "count",
                        "cardinality",
                        "multi",
                        "partitions",
                    ],
                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                })
                Log.warning("Could not get {{col.es_index}}.{{col.es_column}} info", col=column, cause=e)

    def monitor(self, please_stop):
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
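            # REFILL todo WITH STALE COLUMNS, THEN MEASURE THEM ONE AT A TIME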
            try:
                if not self.todo:
                    old_columns = [
                        c
                        for c in self.meta.columns
                        if ((c.last_updated < Date.now() - MAX_COLUMN_METADATA_AGE) or c.cardinality == None) and c.jx_type not in STRUCT
                    ]
                    if old_columns:
                        DEBUG and Log.note(
                            "Old columns {{names|json}} last updated {{dates|json}}",
                            names=wrap(old_columns).es_column,
                            dates=[Date(t).format() for t in wrap(old_columns).last_updated]
                        )
                        self.todo.extend(old_columns)
                    else:
                        DEBUG and Log.note("no more metatdata to update")

                column = self.todo.pop(Till(seconds=(10*MINUTE).seconds))
                if column:
                    if column is THREAD_STOP:
                        continue

                    with Timer("update {{table}}.{{column}}", param={"table": column.es_index, "column": column.es_column}, silent=not DEBUG):
                        if column.es_index in self.index_does_not_exist:
                            DEBUG and Log.note("{{column.es_column}} does not exist", column=column)
                            self.meta.columns.update({
                                "clear": ".",
                                "where": {"eq": {"es_index": column.es_index}}
                            })
                            continue
                        if column.jx_type in STRUCT or split_field(column.es_column)[-1] == EXISTS_TYPE:
                            DEBUG and Log.note("{{column.es_column}} is a struct", column=column)
                            column.last_updated = Date.now()
                            continue
                        elif column.last_updated > Date.now() - TOO_OLD and column.cardinality is not None:
                            # DO NOT UPDATE FRESH COLUMN METADATA
                            DEBUG and Log.note("{{column.es_column}} is still fresh ({{ago}} ago)", column=column, ago=(Date.now()-Date(column.last_updated)).seconds)
                            continue
                        try:
                            self._update_cardinality(column)
                            (DEBUG and not column.es_index.startswith(TEST_TABLE_PREFIX)) and Log.note("updated {{column.name}}", column=column)
                        except Exception as e:
                            if '"status":404' in e:
                                self.meta.columns.update({
                                    "clear": ".",
                                    "where": {"eq": {"es_index": column.es_index, "es_column": column.es_column}}
                                })
                            else:
                                Log.warning("problem getting cardinality for {{column.name}}", column=column, cause=e)
            except Exception as e:
                Log.warning("problem in cardinality monitor", cause=e)

    def not_monitor(self, please_stop):
        Log.alert("metadata scan has been disabled")
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
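        # CARDINALITY IS ONLY MEASURED FOR build.type AND run.type; ALL OTHER COLUMNS ARE SIMPLY MARKED FRESH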
        while not please_stop:
            column = self.todo.pop()
            if column == THREAD_STOP:
                break

            if column.jx_type in STRUCT or split_field(column.es_column)[-1] == EXISTS_TYPE:
                DEBUG and Log.note("{{column.es_column}} is a struct", column=column)
                column.last_updated = Date.now()
                continue
            elif column.last_updated > Date.now() - TOO_OLD and column.cardinality is not None:
                # DO NOT UPDATE FRESH COLUMN METADATA
                DEBUG and Log.note("{{column.es_column}} is still fresh ({{ago}} ago)", column=column, ago=(Date.now()-Date(column.last_updated)).seconds)
                continue

            with Timer("Update {{col.es_index}}.{{col.es_column}}", param={"col": column}, silent=not DEBUG, too_long=0.05):
                if untype_path(column.name) in ["build.type", "run.type"]:
                    try:
                        self._update_cardinality(column)
                    except Exception as e:
                        Log.warning("problem getting cardinality for {{column.name}}", column=column, cause=e)
                else:
                    column.last_updated = Date.now()

    def get_table(self, name):
        if name == "meta.columns":
            return self.meta.columns

        with self.meta.tables.locker:
            return wrap([t for t in self.meta.tables.data if t.name == name])

    def get_snowflake(self, fact_table_name):
        return Snowflake(fact_table_name, self)

    def get_schema(self, name):
        if name == "meta.columns":
            return self.meta.columns.schema
        if name == "meta.tables":
            return self.meta.tables
        root, rest = tail_field(name)
        return self.get_snowflake(root).get_schema(rest)