Example No. 1
    def not_monitor(self, please_stop):
        Log.alert("metadata scan has been disabled")
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            c = self.todo.pop()
            if c == THREAD_STOP:
                break

            if not c.last_updated or c.last_updated >= Date.now()-TOO_OLD:
                continue

            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "last_updated": Date.now()
                    },
                    "clear":[
                        "count",
                        "cardinality",
                        "partitions",
                    ],
                    "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                })
            if DEBUG:
                Log.note("Could not get {{col.es_index}}.{{col.es_column}} info", col=c)
Example No. 2
    def _db_load(self):
        self.last_load = Date.now()

        result = self._query(
            SQL_SELECT
            + "name"
            + SQL_FROM
            + "sqlite_master"
            + SQL_WHERE
            + SQL_AND.join(["name=" + db_table_name, "type=" + quote_value("table")])
        )
        if not result.data:
            self._db_create()
            return

        result = self._query(
            SQL_SELECT
            + all_columns
            + SQL_FROM
            + db_table_name
            + SQL_ORDERBY
            + sql_list(map(quote_column, ["es_index", "name", "es_column"]))
        )

        with self.locker:
            for r in result.data:
                c = row_to_column(result.header, r)
                self._add(c)
Example No. 3
    def _upsert_column(self, c):
        # ASSUMES self.meta.columns.locker IS HELD
        existing_columns = self.meta.columns.find(c.es_index, c.names["."])
        if not existing_columns:
            self.meta.columns.add(c)
            self.todo.add(c)

            if ENABLE_META_SCAN:
                if DEBUG:
                    Log.note("todo: {{table}}::{{column}}", table=c.es_index, column=c.es_column)
                # MARK meta.columns AS DIRTY TOO
                cols = self.meta.columns.find("meta.columns", None)
                for cc in cols:
                    cc.partitions = cc.cardinality = None
                    cc.last_updated = Date.now()
                self.todo.extend(cols)
        else:
            canonical = existing_columns[0]
            if canonical is not c:
                set_default(c.names, canonical.names)
                for key in Column.__slots__:
                    canonical[key] = c[key]
            if DEBUG:
                Log.note("todo: {{table}}::{{column}}", table=canonical.es_index, column=canonical.es_column)
            self.todo.add(canonical)
Example No. 4
def get_branches(hg, branches, kwargs=None):
    # TRY ES
    cluster = elasticsearch.Cluster(branches)
    try:
        es = cluster.get_index(kwargs=branches, read_only=False)
        esq = jx_elasticsearch.new_instance(branches)
        found_branches = esq.query({"from": "branches", "format": "list", "limit": 10000}).data

        # IF IT IS TOO OLD, THEN PULL FROM HG
        oldest = Date(MAX(found_branches.etl.timestamp))
        if oldest == None or Date.now() - oldest > OLD_BRANCH:
            found_branches = _get_branches_from_hg(hg)
            es.extend({"id": b.name + " " + b.locale, "value": b} for b in found_branches)
            es.flush()

        try:
            return UniqueIndex(["name", "locale"], data=found_branches, fail_on_dup=False)
        except Exception as e:
            Log.error("Bad branch in ES index", cause=e)
    except Exception as e:
        e = Except.wrap(e)
        if "Can not find index " in e:
            set_default(branches, {"schema": branches_schema})
            es = cluster.get_or_create_index(branches)
            es.add_alias()
            return get_branches(kwargs)
        Log.error("problem getting branches", cause=e)
Example No. 5
    def __init__(self, host, index, alias=None, name=None, port=9200, kwargs=None):
        global _elasticsearch
        if hasattr(self, "settings"):
            return

        from pyLibrary.queries.containers.list_usingPythonList import ListContainer
        from pyLibrary.env import elasticsearch as _elasticsearch

        self.settings = kwargs
        self.default_name = coalesce(name, alias, index)
        self.default_es = _elasticsearch.Cluster(kwargs=kwargs)
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        self.es_metadata = Null
        self.last_es_metadata = Date.now() - OLD_METADATA

        self.meta = Data()
        table_columns = metadata_tables()
        column_columns = metadata_columns()
        self.meta.tables = ListContainer("meta.tables", [], wrap({c.names["."]: c for c in table_columns}))
        self.meta.columns = ColumnList()
        self.meta.columns.insert(column_columns)
        self.meta.columns.insert(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("refresh metadata", self.not_monitor)
        return
Example No. 6
    def _update_meta(self):
        if not self.dirty:
            return

        for mcl in self.data.get("meta.columns").values():
            for mc in mcl:
                count = 0
                values = set()
                objects = 0
                multi = 1
                for column in self._all_columns():
                    value = column[mc.name]
                    if value == None:
                        pass
                    else:
                        count += 1
                        if is_list(value):
                            multi = max(multi, len(value))
                            try:
                                values |= set(value)
                            except Exception:
                                objects += len(value)
                        elif is_data(value):
                            objects += 1
                        else:
                            values.add(value)
                mc.count = count
                mc.cardinality = len(values) + objects
                mc.partitions = jx.sort(values)
                mc.multi = multi
                mc.last_updated = Date.now()
        self.dirty = False
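
A standalone illustration of the statistics gathered above (count, cardinality, partitions, multi), applied to plain dicts; the helper name is illustrative:

def column_stats(rows, field):
    # COUNT NON-NULL VALUES, DISTINCT VALUES, AND THE LARGEST MULTIVALUE
    count = 0
    values = set()
    multi = 1
    for row in rows:
        value = row.get(field)
        if value is None:
            continue
        count += 1
        if isinstance(value, list):
            multi = max(multi, len(value))
            values |= set(value)
        else:
            values.add(value)
    return {
        "count": count,
        "cardinality": len(values),
        "partitions": sorted(values),
        "multi": multi,
    }

column_stats([{"a": 1}, {"a": [1, 2]}, {"b": 3}], "a")
# {'count': 2, 'cardinality': 2, 'partitions': [1, 2], 'multi': 2}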
Example No. 7
    def get_columns(self, table_name, column_name=None, force=False):
        """
        RETURN METADATA COLUMNS
        """
        table_path = split_field(table_name)
        es_index_name = table_path[0]
        query_path = join_field(table_path[1:])
        table = self.get_table(es_index_name)[0]
        abs_column_name = None if column_name == None else concat_field(query_path, column_name)

        try:
            # LAST TIME WE GOT INFO FOR THIS TABLE
            if not table:
                table = Table(
                    name=es_index_name,
                    url=None,
                    query_path=None,
                    timestamp=Date.now()
                )
                with self.meta.tables.locker:
                    self.meta.tables.add(table)
                self._get_columns(table=es_index_name)
            elif force or table.timestamp == None or table.timestamp < Date.now() - MAX_COLUMN_METADATA_AGE:
                table.timestamp = Date.now()
                self._get_columns(table=es_index_name)

            with self.meta.columns.locker:
                columns = self.meta.columns.find(es_index_name, column_name)
            if columns:
                columns = jx.sort(columns, "names.\.")
                # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
                while len(self.todo) and not all(columns.get("last_updated")):
                    if DEBUG:
                        Log.note("waiting for columns to update {{columns|json}}", columns=[c.es_index+"."+c.es_column for c in columns if not c.last_updated])
                    Till(seconds=1).wait()
                return columns
        except Exception as e:
            Log.error("Not expected", cause=e)

        if abs_column_name:
            Log.error("no columns matching {{table}}.{{column}}", table=table_name, column=abs_column_name)
        else:
            self._get_columns(table=table_name)  # TO TEST WHAT HAPPENED
            Log.error("no columns for {{table}}?!", table=table_name)
Example No. 8
        def add_column(c, query_path):
            c.last_updated = Date.now()
            if query_path[0] != ".":
                c.names[query_path[0]] = relative_field(c.names["."], query_path[0])

            with self.meta.columns.locker:
                self._upsert_column(c)
                for alias in meta.aliases:
                    c = copy(c)
                    c.es_index = alias
                    self._upsert_column(c)
Example No. 9
    def monitor(self, please_stop):
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            try:
                if not self.todo:
                    with self.meta.columns.locker:
                        old_columns = [
                            c
                            for c in self.meta.columns
                            if (c.last_updated == None or c.last_updated < Date.now()-TOO_OLD) and c.type not in STRUCT
                        ]
                        if old_columns:
                            if DEBUG:
                                Log.note("Old columns wth dates {{dates|json}}", dates=wrap(old_columns).last_updated)
                            self.todo.extend(old_columns)
                            # TEST CONSISTENCY
                            for c, d in product(list(self.todo.queue), list(self.todo.queue)):
                                if c.es_column == d.es_column and c.es_index == d.es_index and c != d:
                                    Log.error("")
                        else:
                            if DEBUG:
                                Log.note("no more metatdata to update")

                column = self.todo.pop(Till(seconds=(10*MINUTE).seconds))
                if column:
                    if DEBUG:
                        Log.note("update {{table}}.{{column}}", table=column.es_index, column=column.es_column)
                    if column.type in STRUCT:
                        with self.meta.columns.locker:
                            column.last_updated = Date.now()
                        continue
                    elif column.last_updated >= Date.now()-TOO_OLD:
                        continue
                    try:
                        self._update_cardinality(column)
                        if DEBUG and not column.es_index.startswith(TEST_TABLE_PREFIX):
                            Log.note("updated {{column.name}}", column=column)
                    except Exception as e:
                        Log.warning("problem getting cardinality for {{column.name}}", column=column, cause=e)
            except Exception as e:
                Log.warning("problem in cardinality monitor", cause=e)
Example No. 10
    def _get_columns(self, table=None):
        # TODO: HANDLE MORE THAN ONE ES, MAP TABLE SHORT_NAME TO ES INSTANCE
        table_path = split_field(table)
        es_index = table_path[0]
        query_path = join_field(table_path[1:])
        meta = self.es_metadata.indices[es_index]
        if not meta or self.last_es_metadata < Date.now() - OLD_METADATA:
            self.es_metadata = self.default_es.get_metadata(force=True)
            meta = self.es_metadata.indices[es_index]

        for _, properties in meta.mappings.items():
            properties.properties["_id"] = {"type": "string", "index": "not_analyzed"}
            self._parse_properties(meta.index, properties, meta)
Example No. 11
def test_mode_wait(query, please_stop):
    """
    WAIT FOR METADATA TO ARRIVE ON INDEX
    :param query: dict() OF REQUEST BODY
    :param please_stop: SIGNAL TO STOP WAITING
    :return: nothing
    """

    if not query["from"]:
        return

    try:
        if query["from"].startswith("meta."):
            return

        alias = split_field(query["from"])[0]
        after = Date.now()
        require_cardinality = meta.ENABLE_META_SCAN
        with Timer(
            "Get columns for {{table}} after {{after}}",
            {"table": alias, "after": after},
            verbose=DEBUG,
        ):
            metadata_manager = find_container(alias, after=after).namespace

            timeout = Till(seconds=MINUTE.seconds) | please_stop
            while not timeout:
                # GET FRESH VERSIONS
                cols = metadata_manager.get_columns(table_name=alias,
                                                    after=after,
                                                    timeout=timeout)
                not_ready = [
                    c for c in cols if c.jx_type not in STRUCT and (
                        after >= c.last_updated or
                        (require_cardinality and c.cardinality == None))
                ]
                if not_ready:
                    Log.note(
                        "wait for column (table={{col.es_index}}, name={{col.es_column}}, cardinality={{col.cardinality|json}}, last_updated={{col.last_updated|datetime}}) metadata to arrive",
                        col=first(not_ready),
                    )
                else:
                    break
                Till(seconds=1).wait()
    except Exception as e:
        Log.warning("could not pickup columns", cause=e)
Example No. 12
    def _db_load(self):
        self.last_load = Date.now()

        try:
            self.es_index = self.es_cluster.get_index(
                id=ID,
                index=META_COLUMNS_NAME,
                type=META_COLUMNS_TYPE_NAME,
                read_only=False)

            result = self.es_index.search({
                "query": {
                    "bool": {
                        "should": [
                            {
                                "bool": {
                                    "must_not": {
                                        "exists": {
                                            "field": "cardinality.~n~"
                                        }
                                    }
                                }
                            },
                            {  # ASSUME UNUSED COLUMNS DO NOT EXIST
                                "range": {
                                    "cardinality.~n~": {
                                        "gt": 0
                                    }
                                }
                            },
                        ]
                    }
                },
                "sort": ["es_index.~s~", "name.~s~", "es_column.~s~"],
                "size":
                10000,
            })

            Log.note("{{num}} columns loaded", num=result.hits.total)
            with self.locker:
                for r in result.hits.hits._source:
                    self._add(doc_to_column(r))

        except Exception as e:
            Log.warning("no {{index}} exists, making one",
                        index=META_COLUMNS_NAME,
                        cause=e)
            self._db_create()
Example No. 13
    def setUpClass(self):
        # REMOVE OLD INDEXES
        cluster = elasticsearch.Cluster(test_jx.global_settings.backend_es)
        aliases = cluster.get_aliases()
        for a in aliases:
            try:
                if a.index.startswith("testing_"):
                    create_time = Date(
                        a.index[-15:], "%Y%m%d_%H%M%S"
                    )  # EXAMPLE testing_0ef53e45b320160118_180420
                    if create_time < Date.now() - 10 * MINUTE:
                        cluster.delete_index(a.index)
            except Exception as e:
                Log.warning("Problem removing {{index|quote}}",
                            index=a.index,
                            cause=e)
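
The cleanup above parses a fixed 15-character timestamp suffix from each index name. A standalone sketch of that parsing with the standard library (the function name is illustrative):

from datetime import datetime, timedelta

def is_stale(index_name, max_age=timedelta(minutes=10)):
    # EXAMPLE: testing_0ef53e45b320160118_180420 -> 20160118_180420
    create_time = datetime.strptime(index_name[-15:], "%Y%m%d_%H%M%S")
    return create_time < datetime.utcnow() - max_age

is_stale("testing_0ef53e45b320160118_180420")  # True, THE SUFFIX IS FROM 2016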
Example No. 14
    def _get_columns(self, table=None):
        # TODO: HANDLE MORE THAN ONE ES, MAP TABLE SHORT_NAME TO ES INSTANCE
        table_path = split_field(table)
        es_index = table_path[0]
        query_path = join_field(table_path[1:])
        meta = self.es_metadata.indices[es_index]
        if not meta or self.last_es_metadata < Date.now() - OLD_METADATA:
            self.es_metadata = self.default_es.get_metadata(force=True)
            meta = self.es_metadata.indices[es_index]

        for _, properties in meta.mappings.items():
            properties.properties["_id"] = {
                "type": "string",
                "index": "not_analyzed"
            }
            self._parse_properties(meta.index, properties, meta)
Example No. 15
    def _daemon(self, please_stop):
        while not please_stop:
            with Explanation("looking for work"):
                try:
                    branch, revisions = self.todo.pop(till=please_stop)
                except Exception as e:
                    if please_stop:
                        break
                    else:
                        raise e
                if branch.name in DAEMON_DO_NO_SCAN:
                    continue
                revisions = set(revisions)

                # FIND THE REVISIONS ON THIS BRANCH
                for r in list(revisions):
                    try:
                        rev = self.get_revision(
                            Revision(branch=branch, changeset={"id": r}))
                        if DAEMON_DEBUG:
                            Log.note(
                                "found revision with push date {{date|datetime}}",
                                date=rev.push.date)
                        revisions.discard(r)

                        if rev.etl.timestamp > Date.now() - DAEMON_RECENT_HG_PULL:
                            # SOME PUSHES ARE BIG, RUNNING THE RISK OTHER MACHINES ARE
                            # ALSO INTERESTED AND PERFORMING THE SAME SCAN. THIS DELAY
                            # WILL HAVE SMALL EFFECT ON THE MAJORITY OF SMALL PUSHES
                            # https://bugzilla.mozilla.org/show_bug.cgi?id=1417720
                            Till(seconds=Random.float(DAEMON_HG_INTERVAL).seconds * 2).wait()

                    except Exception as e:
                        Log.warning(
                            "Scanning {{branch}} {{revision|left(12)}}",
                            branch=branch.name,
                            revision=r,
                            cause=e)
                        if "Read timed out" in e:
                            Till(seconds=DAEMON_WAIT_AFTER_TIMEOUT.seconds
                                 ).wait()

                # FIND ANY BRANCH THAT MAY HAVE THIS REVISION
                for r in list(revisions):
                    self._find_revision(r)
Example No. 16
    def output(*args, **kwargs):
        if kwargs:
            Log.error("Sorry, caching only works with ordered parameter, not keyword arguments")

        with cache_store.locker:
            if using_self:
                self = args[0]
                args = args[1:]
            else:
                self = cache_store

            now = Date.now()
            try:
                _cache = getattr(self, attr_name)
            except Exception:
                _cache = {}
                setattr(self, attr_name, _cache)

            if Random.int(100) == 0:
                # REMOVE OLD CACHE
                _cache = {k: v for k, v in _cache.items() if v.timeout == None or v.timeout > now}
                setattr(self, attr_name, _cache)

            timeout, key, value, exception = _cache.get(args, (Null, Null, Null, Null))

        if now >= timeout:
            value = func(self, *args)
            with cache_store.locker:
                _cache[args] = CacheElement(now + cache_store.timeout, args, value, None)
            return value

        if value == None:
            if exception == None:
                try:
                    value = func(self, *args)
                    with cache_store.locker:
                        _cache[args] = CacheElement(now + cache_store.timeout, args, value, None)
                    return value
                except Exception as e:
                    e = Except.wrap(e)
                    with cache_store.locker:
                        _cache[args] = CacheElement(now + cache_store.timeout, args, None, e)
                    raise e
            else:
                raise exception
        else:
            return value
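
A minimal standalone sketch of the same cache-with-timeout idea, using only the standard library; unlike the original it does not memoize exceptions, and all names are illustrative:

import time
import threading
from functools import wraps

def cache(seconds=60):
    def decorator(func):
        _cache = {}  # args -> (expiry, value)
        locker = threading.Lock()

        @wraps(func)
        def output(*args):
            now = time.time()
            with locker:
                hit = _cache.get(args)
                if hit and now < hit[0]:
                    return hit[1]
            value = func(*args)  # COMPUTE OUTSIDE THE LOCK
            with locker:
                _cache[args] = (now + seconds, value)
            return value
        return output
    return decorator

@cache(seconds=30)
def slow_square(x):
    time.sleep(0.1)
    return x * x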
Example No. 17
    def _load_from_database(self):
        # FIND ALL TABLES
        result = self.db.query(sql_query({
            "from": "sqlite_master",
            "where": {"eq": {"type": "table"}},
            "orderby": "name"
        }))
        tables = wrap([{k: d for k, d in zip(result.header, row)} for row in result.data])
        last_nested_path = ["."]
        for table in tables:
            if table.name.startswith("__"):
                continue
            base_table, nested_path = tail_field(table.name)

            # FIND COMMON NESTED PATH SUFFIX
            if nested_path == ".":
                last_nested_path = []
            else:
                for i, p in enumerate(last_nested_path):
                    if startswith_field(nested_path, p):
                        last_nested_path = last_nested_path[i:]
                        break
                else:
                    last_nested_path = []

            full_nested_path = [nested_path] + last_nested_path
            self._snowflakes[literal_field(base_table)] += [full_nested_path]

            # LOAD THE COLUMNS
            details = self.db.about(table.name)

            for cid, name, dtype, notnull, dfft_value, pk in details:
                if name.startswith("__"):
                    continue
                cname, ctype = untyped_column(name)
                self.add(Column(
                    name=cname,
                    jx_type=coalesce(sql_type_to_json_type.get(ctype), IS_NULL),
                    nested_path=full_nested_path,
                    es_type=dtype,
                    es_column=name,
                    es_index=table.name,
                    last_updated=Date.now()
                ))
            last_nested_path = full_nested_path
Example No. 18
    def __init__(
            self,
            hg=None,  # CONNECT TO hg
            repo=None,  # CONNECTION INFO FOR ES CACHE
            branches=None,  # CONNECTION INFO FOR ES CACHE
            use_cache=False,  # True IF WE WILL USE THE ES FOR DOWNLOADING BRANCHES
            timeout=30 * SECOND,
            kwargs=None):
        if not _hg_branches:
            _late_imports()

        self.es_locker = Lock()
        self.todo = mo_threads.Queue("todo for hg daemon",
                                     max=DAEMON_QUEUE_SIZE)

        self.settings = kwargs
        self.timeout = Duration(timeout)

        # VERIFY CONNECTIVITY
        with Explanation("Test connect with hg"):
            response = http.head(self.settings.hg.url)

        if branches == None:
            self.branches = _hg_branches.get_branches(kwargs=kwargs)
            self.es = None
            return

        self.last_cache_miss = Date.now()

        set_default(repo, {"schema": revision_schema})
        self.es = elasticsearch.Cluster(kwargs=repo).get_or_create_index(
            kwargs=repo)

        def setup_es(please_stop):
            with suppress_exception:
                self.es.add_alias()

            with suppress_exception:
                self.es.set_refresh_interval(seconds=1)

        Thread.run("setup_es", setup_es)
        self.branches = _hg_branches.get_branches(kwargs=kwargs)
        self.timeout = timeout
        Thread.run("hg daemon", self._daemon)
Example No. 19
    def __init__(
        self,
        hg=None,        # CONNECT TO hg
        repo=None,      # CONNECTION INFO FOR ES CACHE
        branches=None,  # CONNECTION INFO FOR ES CACHE
        use_cache=False,   # True IF WE WILL USE THE ES FOR DOWNLOADING BRANCHES
        timeout=30 * SECOND,
        kwargs=None
    ):
        if not _hg_branches:
            _late_imports()

        self.es_locker = Lock()
        self.todo = mo_threads.Queue("todo for hg daemon", max=DAEMON_QUEUE_SIZE)

        self.settings = kwargs
        self.timeout = Duration(timeout)

        # VERIFY CONNECTIVITY
        with Explanation("Test connect with hg"):
            response = http.head(self.settings.hg.url)

        if branches == None:
            self.branches = _hg_branches.get_branches(kwargs=kwargs)
            self.es = None
            return

        self.last_cache_miss = Date.now()

        set_default(repo, {"schema": revision_schema})
        self.es = elasticsearch.Cluster(kwargs=repo).get_or_create_index(kwargs=repo)

        def setup_es(please_stop):
            with suppress_exception:
                self.es.add_alias()

            with suppress_exception:
                self.es.set_refresh_interval(seconds=1)

        Thread.run("setup_es", setup_es)
        self.branches = _hg_branches.get_branches(kwargs=kwargs)
        self.timeout = timeout
        Thread.run("hg daemon", self._daemon)
Example No. 20
    def _db_load(self):
        self.last_load = Date.now()

        result = self._query(
            SQL_SELECT + "name" + SQL_FROM + "sqlite_master" + SQL_WHERE +
            SQL_AND.join(
                ["name=" + db_table_name, "type=" + quote_value("table")]))
        if not result.data:
            self._db_create()
            return

        result = self._query(
            SQL_SELECT + all_columns + SQL_FROM + db_table_name + SQL_ORDERBY +
            sql_list(map(quote_column, ["es_index", "name", "es_column"])))

        with self.locker:
            for r in result.data:
                c = row_to_column(result.header, r)
                self._add(c)
Example No. 21
    def _daemon(self, please_stop):
        while not please_stop:
            with Explanation("looking for work"):
                try:
                    branch, revisions = self.todo.pop(till=please_stop)
                except Exception as e:
                    if please_stop:
                        break
                    else:
                        raise e
                if branch.name in DAEMON_DO_NO_SCAN:
                    continue
                revisions = set(revisions)

                # FIND THE REVISIONS ON THIS BRANCH
                for r in list(revisions):
                    try:
                        rev = self.get_revision(Revision(branch=branch, changeset={"id": r}))
                        if DAEMON_DEBUG:
                            Log.note("found revision with push date {{date|datetime}}", date=rev.push.date)
                        revisions.discard(r)

                        if rev.etl.timestamp > Date.now() - (DAEMON_RECENT_HG_PULL * SECOND):
                            # SOME PUSHES ARE BIG, RUNNING THE RISK OTHER MACHINES ARE
                            # ALSO INTERESTED AND PERFORMING THE SAME SCAN. THIS DELAY
                            # WILL HAVE SMALL EFFECT ON THE MAJORITY OF SMALL PUSHES
                            # https://bugzilla.mozilla.org/show_bug.cgi?id=1417720
                            Till(seconds=Random.float(DAEMON_HG_INTERVAL*2)).wait()

                    except Exception as e:
                        Log.warning(
                            "Scanning {{branch}} {{revision|left(12)}}",
                            branch=branch.name,
                            revision=r,
                            cause=e
                        )
                        if "Read timed out" in e:
                            Till(seconds=DAEMON_WAIT_AFTER_TIMEOUT).wait()


                # FIND ANY BRANCH THAT MAY HAVE THIS REVISION
                for r in list(revisions):
                    self._find_revision(r)
Example No. 22
    def _load_from_database(self):
        # FIND ALL TABLES
        result = self.db.query(
            "SELECT * FROM sqlite_master WHERE type='table' ORDER BY name")
        tables = wrap([{k: d
                        for k, d in zip(result.header, row)}
                       for row in result.data])
        last_nested_path = []
        for table in tables:
            if table.name.startswith("__"):
                continue
            base_table, nested_path = tail_field(table.name)

            # FIND COMMON NESTED PATH SUFFIX
            for i, p in enumerate(last_nested_path):
                if startswith_field(nested_path, p):
                    last_nested_path = last_nested_path[i:]
                    break
            else:
                last_nested_path = []

            full_nested_path = [nested_path] + last_nested_path
            self._snowflakes[literal_field(base_table)] += [full_nested_path]

            # LOAD THE COLUMNS
            command = "PRAGMA table_info" + sql_iso(quote_column(table.name))
            details = self.db.query(command)

            for cid, name, dtype, notnull, dfft_value, pk in details.data:
                if name.startswith("__"):
                    continue
                cname, ctype = untyped_column(name)
                self.add(
                    Column(name=cname,
                           jx_type=coalesce(sql_type_to_json_type.get(ctype),
                                            IS_NULL),
                           nested_path=full_nested_path,
                           es_type=dtype,
                           es_column=name,
                           es_index=table.name,
                           last_updated=Date.now()))
            last_nested_path = full_nested_path
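
The same schema introspection is available through the standard library's sqlite3 module; PRAGMA table_info returns (cid, name, type, notnull, dflt_value, pk) rows, matching the unpacking above. A minimal sketch:

import sqlite3

db = sqlite3.connect(":memory:")
db.execute("CREATE TABLE example (id INTEGER PRIMARY KEY, name TEXT)")

# FIND ALL TABLES
tables = db.execute(
    "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name"
).fetchall()

# LOAD THE COLUMNS OF EACH TABLE
for (table_name,) in tables:
    for cid, name, dtype, notnull, dflt_value, pk in db.execute(
        "PRAGMA table_info(" + table_name + ")"
    ):
        print(table_name, name, dtype)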
Example No. 23
    def output(*args):
        with cache_store.locker:
            if using_self:
                self = args[0]
                args = args[1:]
            else:
                self = cache_store

            now = Date.now()
            try:
                _cache = getattr(self, attr_name)
            except Exception:
                _cache = {}
                setattr(self, attr_name, _cache)

            if Random.int(100) == 0:
                # REMOVE OLD CACHE
                _cache = {k: v for k, v in _cache.items() if v[0]==None or v[0] > now}
                setattr(self, attr_name, _cache)

            timeout, key, value, exception = _cache.get(args, (Null, Null, Null, Null))
Example No. 24
def test_mode_wait(query):
    """
    WAIT FOR METADATA TO ARRIVE ON INDEX
    :param query: dict() OF REQUEST BODY
    :return: nothing
    """

    if not query["from"]:
        return

    try:
        if query["from"].startswith("meta."):
            return

        now = Date.now()
        alias = split_field(query["from"])[0]
        metadata_manager = find_container(alias).namespace
        # TRIGGER A METADATA RELOAD AFTER THIS TIME
        metadata_manager.meta.tables[alias].timestamp = now

        timeout = Till(seconds=MINUTE.seconds)
        while not timeout:
            # GET FRESH VERSIONS
            cols = [
                c for c in metadata_manager.get_columns(
                    table_name=alias, after=now, timeout=timeout)
                if c.jx_type not in STRUCT
            ]
            for c in cols:
                if now >= c.last_updated:
                    Log.note(
                        "wait for column (table={{col.es_index}}, name={{col.es_column}}) metadata to arrive",
                        col=c)
                    break
            else:
                break
            Till(seconds=1).wait()
    except Exception as e:
        Log.warning("could not pickup columns", cause=e)
Example No. 25
    def __init__(
        self,
        service_url,    # location of the ActiveData server we are testing
        backend_es,     # the ElasticSearch settings for filling the backend
        sql_url=None,   # location of the SQL service
        fast_testing=False,
        kwargs=None
    ):
        if backend_es.schema == None:
            Log.error("Expecting backend_es to have a schema defined")

        letters = unicode(ascii_lowercase)
        self.random_letter = letters[int(Date.now().unix / 30) % 26]
        self.service_url = service_url
        self.sql_url = sql_url
        self.backend_es = backend_es
        self.settings = kwargs
        self._es_test_settings = None
        self._es_cluster = None
        self._index = None

        if not containers.config.default:
            containers.config.default = {
                "type": "elasticsearch",
                "settings": backend_es
            }

        if not fast_testing:
            self.server = http
        else:
            Log.alert("TESTS WILL RUN FAST, BUT NOT ALL TESTS ARE RUN!\nEnsure the `file://tests/config/elasticsearch.json#fastTesting=true` to turn on the network response tests.")
            # WE WILL USE THE ActiveServer CODE, AND CONNECT TO ES DIRECTLY.
            # THIS MAKES FOR SLIGHTLY FASTER TEST TIMES BECAUSE THE PROXY IS
            # MISSING
            self.server = FakeHttp()
            containers.config.default = {
                "type": "elasticsearch",
                "settings": kwargs.backend_es.copy()
            }
Example No. 26
    def __init__(self,
                 host,
                 index,
                 alias=None,
                 name=None,
                 port=9200,
                 kwargs=None):
        global _elasticsearch
        if hasattr(self, "settings"):
            return

        from pyLibrary.queries.containers.list_usingPythonList import ListContainer
        from pyLibrary.env import elasticsearch as _elasticsearch

        self.settings = kwargs
        self.default_name = coalesce(name, alias, index)
        self.default_es = _elasticsearch.Cluster(kwargs=kwargs)
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        self.es_metadata = Null
        self.last_es_metadata = Date.now() - OLD_METADATA

        self.meta = Data()
        table_columns = metadata_tables()
        column_columns = metadata_columns()
        self.meta.tables = ListContainer(
            "meta.tables", [], wrap({c.names["."]: c
                                     for c in table_columns}))
        self.meta.columns = ColumnList()
        self.meta.columns.insert(column_columns)
        self.meta.columns.insert(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("refresh metadata", self.not_monitor)
        return
Example No. 27
def get_branches(hg, branches, kwargs=None):
    # TRY ES
    cluster = elasticsearch.Cluster(branches)
    try:
        es = cluster.get_index(kwargs=branches, read_only=False)
        esq = jx_elasticsearch.new_instance(branches)
        found_branches = esq.query({
            "from": branches.index,
            "format": "list",
            "limit": 10000
        }).data

        # IF IT IS TOO OLD, THEN PULL FROM HG
        oldest = Date(MAX(found_branches.etl.timestamp))
        if oldest == None or Date.now() - oldest > OLD_BRANCH:
            found_branches = _get_branches_from_hg(hg)
            es.extend([{
                "id": b.name + " " + b.locale,
                "value": b
            } for b in found_branches])
            es.flush()

        try:
            return UniqueIndex(["name", "locale"],
                               data=found_branches,
                               fail_on_dup=False)
        except Exception as e:
            Log.error("Bad branch in ES index", cause=e)
    except Exception as e:
        e = Except.wrap(e)
        if "Can not find index " in e:
            branches.schema = branches_schema
            es = cluster.get_or_create_index(branches)
            es.add_alias()
            return get_branches(kwargs)
        Log.error("problem getting branches", cause=e)
Example No. 28
    def test_chunk_timing(self):
        if self.not_real_service():
            return

        test = wrap({"query": {
            "from": {
                "type": "elasticsearch",
                "settings": {
                    "host": ES_CLUSTER_LOCATION,
                    "index": "unittest",
                    "type": "test_result"
                }
            },
            "select": {"value": "run.stats.duration", "aggregate": "average"},
            "edges": [
                {"name": "chunk", "value": ["run.suite", "run.chunk"]}
            ],
            "where": {"and": [
                {"term": {"etl.id": 0}},
                {"gte": {"timestamp": Date.floor(Date.now() - (Duration.DAY * 7), Duration.DAY).milli / 1000}}
            ]},
            "format": "cube",
            "samples": {
                "limit": 30
            }
        }})

        query = convert.unicode2utf8(convert.value2json(test.query))
        # EXECUTE QUERY
        with Timer("query"):
            response = self.utils.try_till_response(self.service_url, data=query)
            if response.status_code != 200:
                error(response)
        result = convert.json2value(convert.utf82unicode(response.all_content))

        Log.note("result\n{{result|indent}}", {"result": result})
Example No. 29
    def __init__(self,
                 host,
                 index,
                 alias=None,
                 name=None,
                 port=9200,
                 kwargs=None):
        if hasattr(self, "settings"):
            return

        self.too_old = TOO_OLD
        self.settings = kwargs
        self.default_name = coalesce(name, alias, index)
        self.default_es = elasticsearch.Cluster(kwargs=kwargs)
        self.index_does_not_exist = set()
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        self.es_metadata = Null
        self.abs_columns = set()
        self.last_es_metadata = Date.now() - OLD_METADATA

        self.meta = Data()
        table_columns = metadata_tables()
        column_columns = metadata_columns()
        self.meta.tables = ListContainer(
            "meta.tables", [], wrap({c.names["."]: c
                                     for c in table_columns}))
        self.meta.columns = ColumnList()
        self.meta.columns.insert(column_columns)
        self.meta.columns.insert(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("refresh metadata", self.not_monitor)
        return
Example No. 30
    def work(please_stop):
        started.go()
        while not please_stop:
            acc.append(Date.now().unix)
            Till(seconds=0.1).wait()
Example No. 31
        })

        try:
            query = wrap(result.data).query
            if len(query) == 0:
                return None
        except Exception as e:
            return None

        self.es.update({
            "update": {
                "type": "elasticsearch",
                "settings": self.es.settings
            },
            "set": {
                "last_used": Date.now()
            },
            "where": {
                "eq": {
                    "hash": hash
                }
            }
        })

        return query[0]

    def save(self, query):
        query.meta = None
        json = convert.value2json(query)
        hash = convert.unicode2utf8(json)
Example No. 32
def _get_schema_from_list(
    frum,  # The list
    table_name,  # Name of the table this list holds records for
    parent,  # parent path
    nested_path,  # each nested array, in reverse order
    columns,  # map from full name to column definition
    native_type_to_json_type  # dict from storage type name to json type name
):
    for d in frum:
        row_type = python_type_to_json_type[d.__class__]

        if row_type != "object":
            # EXPECTING PRIMITIVE VALUE
            full_name = parent
            column = columns[full_name]
            if not column:
                column = Column(
                    name=concat_field(table_name, full_name),
                    es_column=full_name,
                    es_index=".",
                    es_type=d.__class__.__name__,
                    jx_type=None,  # WILL BE SET BELOW
                    last_updated=Date.now(),
                    nested_path=nested_path,
                )
                columns.add(column)
            column.es_type = _merge_python_type(column.es_type, d.__class__)
            column.jx_type = native_type_to_json_type[column.es_type]
        else:
            for name, value in d.items():
                full_name = concat_field(parent, name)
                column = columns[full_name]
                if not column:
                    column = Column(
                        name=concat_field(table_name, full_name),
                        es_column=full_name,
                        es_index=".",
                        es_type=value.__class__.__name__,
                        jx_type=None,  # WILL BE SET BELOW
                        last_updated=Date.now(),
                        nested_path=nested_path,
                    )
                    columns.add(column)
                if is_container(value):  # GET TYPE OF MULTIVALUE
                    v = list(value)
                    if len(v) == 0:
                        this_type = none_type.__name__
                    elif len(v) == 1:
                        this_type = v[0].__class__.__name__
                    else:
                        this_type = reduce(_merge_python_type,
                                           (vi.__class__.__name__
                                            for vi in value))
                else:
                    this_type = value.__class__.__name__
                column.es_type = _merge_python_type(column.es_type, this_type)
                column.jx_type = native_type_to_json_type[column.es_type]

                if this_type in {"object", "dict", "Mapping", "Data"}:
                    _get_schema_from_list([value], table_name, full_name,
                                          nested_path, columns,
                                          native_type_to_json_type)
                elif this_type in {"list", "FlatList"}:
                    np = listwrap(nested_path)
                    newpath = unwraplist(
                        [join_field(split_field(np[0]) + [name])] + np)
                    _get_schema_from_list(value, table_name, full_name,
                                          newpath, columns,
                                          native_type_to_json_type)
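
A much-reduced standalone sketch of the same idea, folding together the Python type names seen per field across a list of records (illustrative; flat dicts only, no nested paths):

def schema_from_list(rows):
    # MAP field -> SET OF PYTHON TYPE NAMES SEEN FOR THAT FIELD
    schema = {}
    for row in rows:
        for name, value in row.items():
            if isinstance(value, list):
                types = {v.__class__.__name__ for v in value} or {"NoneType"}
            else:
                types = {value.__class__.__name__}
            schema.setdefault(name, set()).update(types)
    return schema

schema_from_list([{"a": 1, "b": "x"}, {"a": 2.5}])
# {'a': {'int', 'float'}, 'b': {'str'}}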
Example No. 33
def get_id(column):
    """
    :param column:
    :return: Elasticsearch id for column
    """
    return column.es_index + "|" + column.es_column


META_COLUMNS_DESC = TableDesc(name=META_COLUMNS_NAME,
                              url=None,
                              query_path=ROOT_PATH,
                              last_updated=Date.now(),
                              columns=wrap([
                                  Column(
                                      name=c,
                                      es_index=META_COLUMNS_NAME,
                                      es_column=c,
                                      es_type="keyword",
                                      jx_type=STRING,
                                      last_updated=Date.now(),
                                      nested_path=ROOT_PATH,
                                  ) for c in [
                                      "name",
                                      "es_type",
                                      "jx_type",
                                      "nested_path",
                                      "es_column",
Example No. 34
    def extract(self, settings, force, restart, start, merge):
        if not settings.extractor.app_name:
            Log.error("Expecting an extractor.app_name in config file")

        # SETUP DESTINATION
        destination = bigquery.Dataset(
            dataset=settings.extractor.app_name,
            kwargs=settings.destination).get_or_create_table(
                settings.destination)

        try:
            if merge:
                with Timer("merge shards"):
                    destination.merge_shards()

            # RECOVER LAST SQL STATE
            redis = Redis.from_url(REDIS_URL)
            state = redis.get(settings.extractor.key)

            if start:
                state = start, 0
            elif restart or not state:
                state = (0, 0)
                redis.set(settings.extractor.key,
                          value2json(state).encode("utf8"))
            else:
                state = json2value(state.decode("utf8"))

            last_modified, job_id = state

            # SCAN SCHEMA, GENERATE EXTRACTION SQL
            extractor = MySqlSnowflakeExtractor(settings.source)
            canonical_sql = extractor.get_sql(SQL("SELECT 0"))

            # ENSURE SCHEMA HAS NOT CHANGED SINCE LAST RUN
            old_sql = redis.get(settings.extractor.sql)
            if old_sql and old_sql.decode("utf8") != canonical_sql.sql:
                if force:
                    Log.warning("Schema has changed")
                else:
                    Log.error("Schema has changed")
            redis.set(settings.extractor.sql, canonical_sql.sql.encode("utf8"))

            # SETUP SOURCE
            source = MySQL(settings.source.database)

            while True:
                Log.note(
                    "Extracting jobs for last_modified={{last_modified|datetime|quote}}, job.id={{job_id}}",
                    last_modified=last_modified,
                    job_id=job_id,
                )

                # Example: job.id ==283890114
                # get_ids = ConcatSQL(
                #     (SQL_SELECT, sql_alias(quote_value(283890114), "id"))
                # )
                get_ids = sql_query({
                    "from": "job",
                    "select": ["id"],
                    "where": {
                        "or": [
                            {
                                "gt": {
                                    "last_modified": Date(last_modified)
                                }
                            },
                            {
                                "and": [
                                    {
                                        "eq": {
                                            "last_modified":
                                            Date(last_modified)
                                        }
                                    },
                                    {
                                        "gt": {
                                            "id": job_id
                                        }
                                    },
                                ]
                            },
                        ]
                    },
                    "sort": ["last_modified", "id"],
                    "limit": settings.extractor.chunk_size,
                })
                sql = extractor.get_sql(get_ids)

                # PULL FROM source, AND PUSH TO destination
                acc = []
                with source.transaction():
                    cursor = source.query(sql, stream=True, row_tuples=True)
                    extractor.construct_docs(cursor, acc.append, False)
                if not acc:
                    break

                # SOME LIMITS PLACES ON STRING SIZE
                for fl in jx.drill(acc, "job_log.failure_line"):
                    fl.message = strings.limit(fl.message, 10000)
                for r in acc:
                    r.etl.timestamp = Date.now()
                destination.extend(acc)

                # RECORD THE STATE
                last_doc = acc[-1]
                last_modified, job_id = last_doc.last_modified, last_doc.id
                redis.set(
                    settings.extractor.key,
                    value2json((last_modified, job_id)).encode("utf8"),
                )

                if len(acc) < settings.extractor.chunk_size:
                    break

        except Exception as e:
            Log.warning("problem with extraction", cause=e)

        Log.note("done job extraction")

        try:
            with Timer("merge shards"):
                destination.merge_shards()
        except Exception as e:
            Log.warning("problem with merge", cause=e)

        Log.note("done job merge")
Example No. 35
def doc_to_column(doc):
    now = Date.now()
    try:
        doc = to_data(untyped(doc))

        # I HAVE MANAGED TO MAKE MANY MISTAKES WRITING COLUMNS TO ES. HERE ARE THE FIXES

        # FIX
        if not doc.last_updated:
            doc.last_updated = Date.now() - YEAR

        # FIX
        if doc.es_type == None:
            if doc.jx_type == OBJECT:
                doc.es_type = "object"
            else:
                Log.warning("{{doc}} has no es_type", doc=doc)

        # FIX
        if doc.es_type == "nested":
            doc.multi = 1001
        if doc.multi == None:
            doc.multi = 1

        # FIX
        if doc.es_column.endswith("." + NESTED_TYPE):
            if doc.jx_type == OBJECT:
                doc.jx_type = NESTED
                doc.last_updated = now
            if doc.es_type == "nested":
                doc.es_type = "nested"
                doc.last_updated = now

        # FIX
        doc.nested_path = tuple(listwrap(doc.nested_path))
        if last(split_field(doc.es_column)) == NESTED_TYPE and doc.es_type != "nested":
            doc.es_type = "nested"
            doc.jx_type = NESTED
            doc.multi = 1001
            doc.last_updated = now

        # FIX
        expected_nested_path = get_nested_path(doc.es_column)
        if len(doc.nested_path) > 1 and doc.nested_path[-2] == '.':
            doc.nested_path = doc.nested_path[:-1]
            doc.last_updated = now

        # FIX
        if untype_path(doc.es_column) == doc.es_column:
            if doc.nested_path != (".", ):
                if doc.es_index in {"repo"}:
                    pass
                else:
                    Log.note("not expected")
                    doc.nested_path = expected_nested_path
                    doc.last_updated = now
        else:
            if doc.nested_path != expected_nested_path:
                doc.nested_path = expected_nested_path
                doc.last_updated = now

        # FIX
        if last(split_field(doc.es_column)) == EXISTS_TYPE:
            if doc.jx_type != EXISTS:
                doc.jx_type = EXISTS
                doc.last_updated = now

            if doc.cardinality == None:
                doc.cardinality = 1
                doc.last_updated = now

        # FIX
        if doc.jx_type in STRUCT:
            if doc.cardinality not in [0, 1]:
                doc.cardinality = 1  # DO NOT KNOW IF EXISTS OR NOT
                doc.last_updated = now

        return Column(**doc)
    except Exception as e:
        try:
            mark_as_deleted(Column(**doc), now)
        except Exception:
            pass
        return None
Example No. 36
    def update(self, command):
        self.dirty = True
        try:
            command = to_data(command)
            DEBUG and Log.note(
                "Update {{timestamp}}: {{command|json}}",
                command=command,
                timestamp=Date(command["set"].last_updated),
            )
            eq = command.where.eq
            if eq.es_index:
                if len(eq) == 1:
                    if unwraplist(command.clear) == ".":
                        d = self.data
                        i = eq.es_index
                        with self.locker:
                            cols = d[i]
                            del d[i]

                        for c in cols:
                            self.remove(c)
                        return

                    # FASTEST
                    all_columns = self.data.get(eq.es_index, {}).values()
                    with self.locker:
                        columns = [c for cs in all_columns for c in cs]
                elif eq.es_column and len(eq) == 2:
                    # FASTER
                    all_columns = self.data.get(eq.es_index, {}).values()
                    with self.locker:
                        columns = [
                            c for cs in all_columns for c in cs
                            if c.es_column == eq.es_column
                        ]

                else:
                    # SLOWER
                    all_columns = self.data.get(eq.es_index, {}).values()
                    with self.locker:
                        columns = [
                            c for cs in all_columns for c in cs
                            if all(c[k] == v for k, v in
                                   eq.items())  # THIS LINE IS VERY SLOW
                        ]
            else:
                columns = list(self)
                columns = jx.filter(columns, command.where)

            with self.locker:
                for col in columns:
                    DEBUG and Log.note(
                        "update column {{table}}.{{column}}",
                        table=col.es_index,
                        column=col.es_column,
                    )
                    for k in command["clear"]:
                        if k == ".":
                            mark_as_deleted(col, Date.now())
                            self.for_es_update.add(col)
                            lst = self.data[col.es_index]
                            cols = lst[col.name]
                            cols.remove(col)
                            if len(cols) == 0:
                                del lst[col.name]
                                if len(lst) == 0:
                                    del self.data[col.es_index]
                            break
                        else:
                            col[k] = None
                    else:
                        # DID NOT DELETE COLUMN ("."), CONTINUE TO SET PROPERTIES
                        for k, v in command.set.items():
                            col[k] = v
                        self.for_es_update.add(col)

        except Exception as e:
            Log.error("should not happen", cause=e)
Example No. 37
METADATA_COLUMNS = (
    [
        Column(
            name=c,
            es_index="meta.columns",
            es_column=c,
            es_type="keyword",
            jx_type=STRING,
            last_updated=Date.now(),
            nested_path=ROOT_PATH,
        )
        for c in [
            "name",
            "es_type",
            "jx_type",
            "nested_path",
            "es_column",
            "es_index",
            "partitions",
        ]
    ]
    + [
        Column(
            name=c,
Example No. 38
    def _update_cardinality(self, c):
        """
        QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN
        """
        if c.type in STRUCT:
            Log.error("not supported")
        try:
            if c.es_index == "meta.columns":
                with self.meta.columns.locker:
                    partitions = jx.sort([g[c.es_column] for g, _ in jx.groupby(self.meta.columns, c.es_column) if g[c.es_column] != None])
                    self.meta.columns.update({
                        "set": {
                            "partitions": partitions,
                            "count": len(self.meta.columns),
                            "cardinality": len(partitions),
                            "last_updated": Date.now()
                        },
                        "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                    })
                return
            if c.es_index == "meta.tables":
                with self.meta.columns.locker:
                    partitions = jx.sort([g[c.es_column] for g, _ in jx.groupby(self.meta.tables, c.es_column) if g[c.es_column] != None])
                    self.meta.columns.update({
                        "set": {
                            "partitions": partitions,
                            "count": len(self.meta.tables),
                            "cardinality": len(partitions),
                            "last_updated": Date.now()
                        },
                        "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                    })
                return

            es_index = c.es_index.split(".")[0]
            result = self.default_es.post("/" + es_index + "/_search", data={
                "aggs": {c.names["."]: _counting_query(c)},
                "size": 0
            })
            r = result.aggregations.values()[0]
            count = result.hits.total
            cardinality = coalesce(r.value, r._nested.value, 0 if r.doc_count == 0 else None)
            if cardinality == None:
                Log.error("logic error")

            query = Data(size=0)
            if cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99):
                if DEBUG:
                    Log.note("{{table}}.{{field}} has {{num}} parts", table=c.es_index, field=c.es_column, num=cardinality)
                with self.meta.columns.locker:
                    self.meta.columns.update({
                        "set": {
                            "count": count,
                            "cardinality": cardinality,
                            "last_updated": Date.now()
                        },
                        "clear": ["partitions"],
                        "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                    })
                return
            elif c.type in _elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
                if DEBUG:
                    Log.note("{{field}} has {{num}} parts", field=c.name, num=cardinality)
                with self.meta.columns.locker:
                    self.meta.columns.update({
                        "set": {
                            "count": count,
                            "cardinality": cardinality,
                            "last_updated": Date.now()
                        },
                        "clear": ["partitions"],
                        "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                    })
                return
            elif len(c.nested_path) != 1:
                query.aggs[literal_field(c.names["."])] = {
                    "nested": {"path": c.nested_path[0]},
                    "aggs": {"_nested": {"terms": {"field": c.es_column, "size": 0}}}
                }
            else:
                query.aggs[literal_field(c.names["."])] = {"terms": {"field": c.es_column, "size": 0}}

            result = self.default_es.post("/" + es_index + "/_search", data=query)

            aggs = result.aggregations.values()[0]
            if aggs._nested:
                parts = jx.sort(aggs._nested.buckets.key)
            else:
                parts = jx.sort(aggs.buckets.key)

            if DEBUG:
                Log.note("{{field}} has {{parts}}", field=c.name, parts=parts)
            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "partitions": parts,
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                })
        except Exception as e:
            if "IndexMissingException" in e and c.es_index.startswith(TEST_TABLE_PREFIX):
                with self.meta.columns.locker:
                    self.meta.columns.update({
                        "set": {
                            "count": 0,
                            "cardinality": 0,
                            "last_updated": Date.now()
                        },
                        "clear":[
                            "partitions"
                        ],
                        "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                    })
            else:
                self.meta.columns.update({
                    "set": {
                        "last_updated": Date.now()
                    },
                    "clear": [
                        "count",
                        "cardinality",
                        "partitions",
                    ],
                    "where": {"eq": {"names.\\.": ".", "es_index": c.es_index, "es_column": c.es_column}}
                })
                Log.warning("Could not get {{col.es_index}}.{{col.es_column}} info", col=c, cause=e)
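
The guard on `cardinality > 1000 or ...` decides when a column is too high-cardinality to enumerate. A simplified, hypothetical restatement of that first guard (not a function in the source):

def should_skip_partitions(count, cardinality):
    # Mirrors the first guard in _update_cardinality: partitions are
    # fetched only when the column is cheap enough to enumerate.
    # (A separate guard also skips numeric columns with cardinality > 30.)
    return (
        cardinality > 1000
        or (count >= 30 and cardinality == count)   # effectively unique per document
        or (count >= 1000 and cardinality / count > 0.99)
    )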
Example No. 39
    def _normalize_revision(self, r, found_revision, push, get_diff, get_moves):
        new_names = set(r.keys()) - KNOWN_TAGS
        if new_names and not r.tags:
            Log.warning(
                "hg is returning new property names {{names|quote}} for {{changeset}} from {{url}}",
                names=new_names,
                changeset=r.node,
                url=found_revision.branch.url
            )

        changeset = Changeset(
            id=r.node,
            id12=r.node[0:12],
            author=r.user,
            description=strings.limit(coalesce(r.description, r.desc), 2000),
            date=parse_hg_date(r.date),
            files=r.files,
            backedoutby=r.backedoutby if r.backedoutby else None,
            bug=self._extract_bug_id(r.description)
        )
        rev = Revision(
            branch=found_revision.branch,
            index=r.rev,
            changeset=changeset,
            parents=unwraplist(list(set(r.parents))),
            children=unwraplist(list(set(r.children))),
            push=push,
            phase=r.phase,
            bookmarks=unwraplist(r.bookmarks),
            landingsystem=r.landingsystem,
            etl={"timestamp": Date.now().unix, "machine": machine_metadata}
        )

        r.pushuser = None
        r.pushdate = None
        r.pushid = None
        r.node = None
        r.user = None
        r.desc = None
        r.description = None
        r.date = None
        r.files = None
        r.backedoutby = None
        r.parents = None
        r.children = None
        r.bookmarks = None
        r.landingsystem = None

        set_default(rev, r)

        # ADD THE DIFF
        if get_diff:
            rev.changeset.diff = self._get_json_diff_from_hg(rev)
        if get_moves:
            rev.changeset.moves = self._get_moves_from_hg(rev)

        try:
            _id = coalesce(rev.changeset.id12, "") + "-" + rev.branch.name + "-" + coalesce(rev.branch.locale, DEFAULT_LOCALE)
            with self.es_locker:
                self.es.add({"id": _id, "value": rev})
        except Exception as e:
            e = Except.wrap(e)
            Log.warning("Did not save to ES, waiting {{duration}} seconds", duration=WAIT_AFTER_NODE_FAILURE, cause=e)
            Till(seconds=WAIT_AFTER_NODE_FAILURE).wait()
            if "FORBIDDEN/12/index read-only" in e:
                pass  # KNOWN FAILURE MODE

        return rev
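
The Elasticsearch document id assembled in the try-block follows the pattern `<id12>-<branch name>-<locale>`. A hedged illustration with made-up values:

# id12 is the first 12 characters of the changeset hash
_id = "b2cf3b4645b1" + "-" + "mozilla-central" + "-" + "en-US"
# -> "b2cf3b4645b1-mozilla-central-en-US"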
Example No. 40
def doc_to_column(doc):
    kwargs = set_default(untyped(doc), {"last_updated": Date.now() - YEAR})
    return Column(**wrap(kwargs))
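
A usage sketch: any stored column document missing `last_updated` is stamped one `YEAR` in the past, so it reads as stale and eligible for rescanning. The field values here are illustrative:

col = doc_to_column({"name": "status", "es_index": "task", "es_column": "status"})
# col.last_updated defaults to Date.now() - YEAR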
Example No. 41
    def get_revision(self, revision, locale=None, get_diff=False, get_moves=True):
        """
        EXPECTING INCOMPLETE revision OBJECT
        RETURNS revision
        """
        rev = revision.changeset.id
        if not rev:
            return Null
        elif rev == "None":
            return Null
        elif revision.branch.name == None:
            return Null
        locale = coalesce(locale, revision.branch.locale, DEFAULT_LOCALE)
        output = self._get_from_elasticsearch(revision, locale=locale, get_diff=get_diff)
        if output:
            if not get_diff:  # DIFF IS BIG, DO NOT KEEP IT IF NOT NEEDED
                output.changeset.diff = None
            if not get_moves:
                output.changeset.moves = None
            DEBUG and Log.note("Got hg ({{branch}}, {{locale}}, {{revision}}) from ES", branch=output.branch.name, locale=locale, revision=output.changeset.id)
            if output.push.date >= Date.now()-MAX_TODO_AGE:
                self.todo.add((output.branch, listwrap(output.parents)))
                self.todo.add((output.branch, listwrap(output.children)))
            if output.push.date:
                return output

        # RATE LIMIT CALLS TO HG (CACHE MISSES)
        next_cache_miss = self.last_cache_miss + (Random.float(WAIT_AFTER_CACHE_MISS * 2) * SECOND)
        self.last_cache_miss = Date.now()
        if next_cache_miss > self.last_cache_miss:
            Log.note("delaying next hg call for {{seconds|round(decimal=1)}}", seconds=next_cache_miss - self.last_cache_miss)
            Till(till=next_cache_miss.unix).wait()

        found_revision = copy(revision)
        if isinstance(found_revision.branch, (text_type, binary_type)):
            lower_name = found_revision.branch.lower()
        else:
            lower_name = found_revision.branch.name.lower()

        if not lower_name:
            Log.error("Defective revision? {{rev|json}}", rev=found_revision.branch)

        b = found_revision.branch = self.branches[(lower_name, locale)]
        if not b:
            b = found_revision.branch = self.branches[(lower_name, DEFAULT_LOCALE)]
            if not b:
                Log.warning("can not find branch ({{branch}}, {{locale}})", branch=lower_name, locale=locale)
                return Null

        if Date.now() - Date(b.etl.timestamp) > _OLD_BRANCH:
            self.branches = _hg_branches.get_branches(kwargs=self.settings)

        push = self._get_push(found_revision.branch, found_revision.changeset.id)

        url1 = found_revision.branch.url.rstrip("/") + "/json-info?node=" + found_revision.changeset.id[0:12]
        url2 = found_revision.branch.url.rstrip("/") + "/json-rev/" + found_revision.changeset.id[0:12]
        with Explanation("get revision from {{url}}", url=url1, debug=DEBUG):
            raw_rev2 = Null
            try:
                raw_rev1 = self._get_raw_json_info(url1, found_revision.branch)
                raw_rev2 = self._get_raw_json_rev(url2, found_revision.branch)
            except Exception as e:
                if "Hg denies it exists" in e:
                    raw_rev1 = Data(node=revision.changeset.id)
                else:
                    raise e
            output = self._normalize_revision(set_default(raw_rev1, raw_rev2), found_revision, push, get_diff, get_moves)
            if output.push.date >= Date.now()-MAX_TODO_AGE:
                self.todo.add((output.branch, listwrap(output.parents)))
                self.todo.add((output.branch, listwrap(output.children)))

            if not get_diff:  # DIFF IS BIG, DO NOT KEEP IT IF NOT NEEDED
                output.changeset.diff = None
            if not get_moves:
                output.changeset.moves = None
            return output
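
A hedged usage sketch, assuming `hg` is an instance of the class above and that `wrap` (used elsewhere in these examples) builds the attribute-accessible structure the method expects; the changeset id is made up:

rev = hg.get_revision(
    wrap({"branch": {"name": "mozilla-central"}, "changeset": {"id": "b2cf3b4645b1"}}),
    get_diff=False,   # the diff is big; skip it unless needed
    get_moves=False,
)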
Example No. 42
def proto_name(prefix, timestamp=None):
    if not timestamp:
        timestamp = Date.now()
    else:
        timestamp = Date(timestamp)
    return prefix + timestamp.format(INDEX_DATE_FORMAT)
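
A usage sketch; `INDEX_DATE_FORMAT` is defined elsewhere, so the resulting suffix is only illustrative:

name = proto_name("testing_", Date("2020-01-02"))
# -> "testing_" + Date("2020-01-02").format(INDEX_DATE_FORMAT)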
Example No. 43
def _get_schema_from_list(frum, table_name, parent, nested_path, columns):
    """
    :param frum: The list
    :param table_name: Name of the table this list holds records for
    :param parent: parent path
    :param nested_path: each nested array, in reverse order
    :param columns: map from full name to column definition
    :return:
    """

    for d in frum:
        row_type = python_type_to_json_type[d.__class__]

        if row_type != "object":
            # EXPECTING PRIMITIVE VALUE
            full_name = parent
            column = columns[full_name]
            if not column:
                column = Column(
                    name=concat_field(table_name, full_name),
                    es_column=full_name,
                    es_index=".",
                    es_type=d.__class__.__name__,
                    jx_type=None,  # WILL BE SET BELOW
                    last_updated=Date.now(),
                    nested_path=nested_path,
                )
                columns.add(column)
            column.es_type = _merge_python_type(column.es_type, d.__class__)
            column.jx_type = python_type_to_json_type[column.es_type]
        else:
            for name, value in d.items():
                full_name = concat_field(parent, name)
                column = columns[full_name]
                if not column:
                    column = Column(
                        name=concat_field(table_name, full_name),
                        es_column=full_name,
                        es_index=".",
                        es_type=value.__class__.__name__,
                        jx_type=None,  # WILL BE SET BELOW
                        last_updated=Date.now(),
                        nested_path=nested_path,
                    )
                    columns.add(column)
                if is_container(value):  # GET TYPE OF MULTIVALUE
                    v = list(value)
                    if len(v) == 0:
                        this_type = none_type.__name__
                    elif len(v) == 1:
                        this_type = v[0].__class__.__name__
                    else:
                        this_type = reduce(
                            _merge_python_type, (vi.__class__.__name__ for vi in value)
                        )
                else:
                    this_type = value.__class__.__name__
                column.es_type = _merge_python_type(column.es_type, this_type)
                column.jx_type = python_type_to_json_type[column.es_type]

                if this_type in {"object", "dict", "Mapping", "Data"}:
                    _get_schema_from_list(
                        [value], table_name, full_name, nested_path, columns
                    )
                elif this_type in {"list", "FlatList"}:
                    np = listwrap(nested_path)
                    newpath = unwraplist([join_field(split_field(np[0]) + [name])] + np)
                    _get_schema_from_list(
                        value, table_name, full_name, newpath, columns
                    )
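
A hedged usage sketch; `columns` must be the source's column container (indexable by full name, with an `add` method), and the records are made up:

records = [{"a": 1, "b": "x"}, {"a": 2, "b": ["y", "z"]}]
_get_schema_from_list(records, "my_table", ".", (".",), columns)
# "a" merges int across rows; "b" merges str with a multivalue list of str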
Example No. 44
def _get_single_branch_from_hg(settings, description, dir):
    if dir == "users":
        return []
    response = http.get(settings.url + "/" + dir)
    doc = BeautifulSoup(response.all_content, "html.parser")

    output = []
    try:
        all_branches = doc("table")[0]
    except Exception:
        return []

    for i, b in enumerate(all_branches("tr")):
        if i == 0:
            continue  # IGNORE HEADER
        columns = b("td")

        try:
            path = columns[0].a.get('href')
            if path == "/":
                continue

            name, desc, last_used = [c.text.strip() for c in columns][0:3]

            if last_used.startswith('at'):
                last_used = last_used[2:]

            detail = Data(
                name=name.lower(),
                locale=DEFAULT_LOCALE,
                parent_name=description,
                url=settings.url + path,
                description=desc,
                last_used=Date(last_used),
                etl={"timestamp": Date.now()}
            )
            if detail.description == "unknown":
                detail.description = None

            # SOME BRANCHES HAVE NAME COLLISIONS, IGNORE LEAST POPULAR
            if path in [
                "/projects/dxr/",                   # moved to webtools
                "/build/compare-locales/",          # ?build team likes to clone?
                "/build/puppet/",                   # ?build team likes to clone?
                "/SeaMonkey/puppet/",               # looses the popularity contest
                "/releases/gaia-l10n/v1_2/en-US/",  # use default branch
                "/releases/gaia-l10n/v1_3/en-US/",  # use default branch
                "/releases/gaia-l10n/v1_4/en-US/",  # use default branch
                "/releases/gaia-l10n/v2_0/en-US/",  # use default branch
                "/releases/gaia-l10n/v2_1/en-US/",  # use default branch
                "/build/autoland/"
            ]:
                continue

            # MARKUP BRANCH IF LOCALE SPECIFIC
            if path.startswith("/l10n-central"):
                _path = path.strip("/").split("/")
                detail.locale = _path[-1]
                detail.name = "mozilla-central"
            elif path.startswith("/releases/l10n/"):
                _path = path.strip("/").split("/")
                detail.locale = _path[-1]
                detail.name = _path[-2].lower()
            elif path.startswith("/releases/gaia-l10n/"):
                _path = path.strip("/").split("/")
                detail.locale = _path[-1]
                detail.name = "gaia-" + _path[-2][1::]
            elif path.startswith("/weave-l10n"):
                _path = path.strip("/").split("/")
                detail.locale = _path[-1]
                detail.name = "weave"

            if BRANCH_WHITELIST is not None:
                found = False
                for br in BRANCH_WHITELIST:
                    if br in str(detail.name):
                        found = True
                        break
                if not found:
                    continue

            Log.note("Branch {{name}} {{locale}}", name=detail.name, locale=detail.locale)
            output.append(detail)
        except Exception as e:
            Log.warning("branch digestion problem", cause=e)

    return output
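
For reference, a hedged sketch of the hgweb page structure being scraped; the first row is the header that gets skipped, and the cell contents are illustrative:

# <table>
#   <tr><th>Name</th><th>Description</th><th>Last used</th></tr>
#   <tr>
#     <td><a href="/l10n-central/de/">l10n-central/de</a></td>
#     <td>unknown</td>
#     <td>at Wed, 01 May 2019 12:00:00</td>
#   </tr>
# </table>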
Example No. 45
    def get_revision(self, revision, locale=None, get_diff=False):
        """
        EXPECTING INCOMPLETE revision OBJECT
        RETURNS revision
        """
        rev = revision.changeset.id
        if not rev:
            return Null
        elif rev == "None":
            return Null
        elif revision.branch.name == None:
            return Null
        locale = coalesce(locale, revision.branch.locale, DEFAULT_LOCALE)
        output = self._get_from_elasticsearch(revision, locale=locale, get_diff=get_diff)
        if output:
            if not get_diff:  # DIFF IS BIG, DO NOT KEEP IT IF NOT NEEDED
                output.changeset.diff = None
            if DEBUG:
                Log.note("Got hg ({{branch}}, {{locale}}, {{revision}}) from ES", branch=output.branch.name, locale=locale, revision=output.changeset.id)
            if output.push.date >= Date.now()-MAX_TODO_AGE:
                self.todo.add((output.branch, listwrap(output.parents)))
                self.todo.add((output.branch, listwrap(output.children)))
            if output.push.date:
                return output

        found_revision = copy(revision)
        if isinstance(found_revision.branch, (text_type, binary_type)):
            lower_name = found_revision.branch.lower()
        else:
            lower_name = found_revision.branch.name.lower()

        if not lower_name:
            Log.error("Defective revision? {{rev|json}}", rev=found_revision.branch)

        b = found_revision.branch = self.branches[(lower_name, locale)]
        if not b:
            b = found_revision.branch = self.branches[(lower_name, DEFAULT_LOCALE)]
            if not b:
                Log.error("can not find branch ({{branch}}, {{locale}})", branch=lower_name, locale=locale)

        if Date.now() - Date(b.etl.timestamp) > _OLD_BRANCH:
            self.branches = _hg_branches.get_branches(kwargs=self.settings)

        push = self._get_push(found_revision.branch, found_revision.changeset.id)

        url1 = found_revision.branch.url.rstrip("/") + "/json-info?node=" + found_revision.changeset.id[0:12]
        url2 = found_revision.branch.url.rstrip("/") + "/json-rev/" + found_revision.changeset.id[0:12]
        with Explanation("get revision from {{url}}", url=url1, debug=DEBUG):
            raw_rev2 = Null
            try:
                raw_rev1 = self._get_raw_json_info(url1, found_revision.branch)
                raw_rev2 = self._get_raw_json_rev(url2, found_revision.branch)
            except Exception as e:
                if "Hg denies it exists" in e:
                    raw_rev1 = Data(node=revision.changeset.id)
                else:
                    raise e
            output = self._normalize_revision(set_default(raw_rev1, raw_rev2), found_revision, push, get_diff)
            if output.push.date >= Date.now()-MAX_TODO_AGE:
                self.todo.add((output.branch, listwrap(output.parents)))
                self.todo.add((output.branch, listwrap(output.children)))

            if not get_diff:  # DIFF IS BIG, DO NOT KEEP IT IF NOT NEEDED
                output.changeset.diff = None
            return output
Example No. 46
    def take_lock(please_stop):
        with locker:
            locker.wait(Till(seconds=1))
            locker.wait(Till(seconds=1))
            locker.wait(Till(till=(Date.now() + SECOND).unix))
Example No. 47
def mark_as_deleted(col):
    col.count = 0
    col.cardinality = 0
    col.multi = 0
    col.partitions = None
    col.last_updated = Date.now()
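
Usage sketch, assuming `col` is a Column; note that, unlike the two-argument `mark_as_deleted(col, Date.now())` call seen earlier in this document, this version stamps `Date.now()` itself:

mark_as_deleted(col)
# col.count == 0, col.cardinality == 0, col.partitions is None,
# and col.last_updated now holds Date.now()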
Example No. 48
    def _update_cardinality(self, c):
        """
        QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN
        """
        if c.type in STRUCT:
            Log.error("not supported")
        try:
            if c.es_index == "meta.columns":
                with self.meta.columns.locker:
                    partitions = jx.sort([g[c.es_column] for g, _ in jx.groupby(self.meta.columns, c.es_column) if g[c.es_column] != None])
                    self.meta.columns.update({
                        "set": {
                            "partitions": partitions,
                            "count": len(self.meta.columns),
                            "cardinality": len(partitions),
                            "last_updated": Date.now()
                        },
                        "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                    })
                return
            if c.es_index == "meta.tables":
                with self.meta.columns.locker:
                    partitions = jx.sort([g[c.es_column] for g, _ in jx.groupby(self.meta.tables, c.es_column) if g[c.es_column] != None])
                    self.meta.columns.update({
                        "set": {
                            "partitions": partitions,
                            "count": len(self.meta.tables),
                            "cardinality": len(partitions),
                            "last_updated": Date.now()
                        },
                        "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                    })
                return

            es_index = c.es_index.split(".")[0]
            result = self.default_es.post("/" + es_index + "/_search", data={
                "aggs": {c.names["."]: _counting_query(c)},
                "size": 0
            })
            r = result.aggregations.values()[0]
            count = result.hits.total
            cardinality = coalesce(r.value, r._nested.value, 0 if r.doc_count == 0 else None)
            if cardinality == None:
                Log.error("logic error")

            query = Data(size=0)
            if cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99):
                if DEBUG:
                    Log.note("{{table}}.{{field}} has {{num}} parts", table=c.es_index, field=c.es_column, num=cardinality)
                with self.meta.columns.locker:
                    self.meta.columns.update({
                        "set": {
                            "count": count,
                            "cardinality": cardinality,
                            "last_updated": Date.now()
                        },
                        "clear": ["partitions"],
                        "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                    })
                return
            elif c.type in _elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
                if DEBUG:
                    Log.note("{{field}} has {{num}} parts", field=c.name, num=cardinality)
                with self.meta.columns.locker:
                    self.meta.columns.update({
                        "set": {
                            "count": count,
                            "cardinality": cardinality,
                            "last_updated": Date.now()
                        },
                        "clear": ["partitions"],
                        "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                    })
                return
            elif len(c.nested_path) != 1:
                query.aggs[literal_field(c.names["."])] = {
                    "nested": {"path": c.nested_path[0]},
                    "aggs": {"_nested": {"terms": {"field": c.es_column, "size": 0}}}
                }
            else:
                query.aggs[literal_field(c.names["."])] = {"terms": {"field": c.es_column, "size": 0}}

            result = self.default_es.post("/" + es_index + "/_search", data=query)

            aggs = result.aggregations.values()[0]
            if aggs._nested:
                parts = jx.sort(aggs._nested.buckets.key)
            else:
                parts = jx.sort(aggs.buckets.key)

            if DEBUG:
                Log.note("{{field}} has {{parts}}", field=c.names["."], parts=parts)
            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "partitions": parts,
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                })
        except Exception as e:
            if "IndexMissingException" in e and c.es_index.startswith(TEST_TABLE_PREFIX):
                with self.meta.columns.locker:
                    self.meta.columns.update({
                        "set": {
                            "count": 0,
                            "cardinality": 0,
                            "last_updated": Date.now()
                        },
                        "clear":[
                            "partitions"
                        ],
                        "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                    })
            else:
                self.meta.columns.update({
                    "set": {
                        "last_updated": Date.now()
                    },
                    "clear": [
                        "count",
                        "cardinality",
                        "partitions",
                    ],
                    "where": {"eq": {"names.\\.": ".", "es_index": c.es_index, "es_column": c.es_column}}
                })
                Log.warning("Could not get {{col.es_index}}.{{col.es_column}} info", col=c, cause=e)
Example No. 49
def _get_single_branch_from_hg(settings, description, dir):
    if dir == "users":
        return []
    response = http.get(settings.url + "/" + dir)
    doc = BeautifulSoup(response.all_content, "html.parser")

    output = []
    try:
        all_branches = doc("table")[0]
    except Exception:
        return []

    for i, b in enumerate(all_branches("tr")):
        if i == 0:
            continue  # IGNORE HEADER
        columns = b("td")

        try:
            path = columns[0].a.get('href')
            if path == "/":
                continue

            name, desc, last_used = [c.text.strip() for c in columns][0:3]

            if last_used.startswith('at'):
                last_used = last_used[2:]

            detail = Data(
                name=name.lower(),
                locale=DEFAULT_LOCALE,
                parent_name=description,
                url=settings.url + path,
                description=desc,
                last_used=Date(last_used),
                etl={"timestamp": Date.now()}
            )
            if detail.description == "unknown":
                detail.description = None

            # SOME BRANCHES HAVE NAME COLLISIONS, IGNORE LEAST POPULAR
            if path in [
                "/projects/dxr/",                   # moved to webtools
                "/build/compare-locales/",          # ?build team likes to clone?
                "/build/puppet/",                   # ?build team likes to clone?
                "/SeaMonkey/puppet/",               # looses the popularity contest
                "/releases/gaia-l10n/v1_2/en-US/",  # use default branch
                "/releases/gaia-l10n/v1_3/en-US/",  # use default branch
                "/releases/gaia-l10n/v1_4/en-US/",  # use default branch
                "/releases/gaia-l10n/v2_0/en-US/",  # use default branch
                "/releases/gaia-l10n/v2_1/en-US/",  # use default branch
                "/build/autoland/"
            ]:
                continue

            # MARKUP BRANCH IF LOCALE SPECIFIC
            if path.startswith("/l10n-central"):
                _path = path.strip("/").split("/")
                detail.locale = _path[-1]
                detail.name = "mozilla-central"
            elif path.startswith("/releases/l10n/"):
                _path = path.strip("/").split("/")
                detail.locale = _path[-1]
                detail.name = _path[-2].lower()
            elif path.startswith("/releases/gaia-l10n/"):
                _path = path.strip("/").split("/")
                detail.locale = _path[-1]
                detail.name = "gaia-" + _path[-2][1::]
            elif path.startswith("/weave-l10n"):
                _path = path.strip("/").split("/")
                detail.locale = _path[-1]
                detail.name = "weave"

            Log.note("Branch {{name}} {{locale}}", name=detail.name, locale=detail.locale)
            output.append(detail)
        except Exception as e:
            Log.warning("branch digestion problem", cause=e)

    return output
Example No. 50
    def columns(self):
        if not self._columns:
            now = Date.now()
            columns = []

            def parse_schema(schema, tops, es_type_info, jx_path, nested_path, es_path):
                if is_text(schema):
                    json_type = schema
                    expected_es_type = json_type_to_bq_type[json_type]
                    if es_type_info and es_type_info != expected_es_type:
                        Log.error(
                            "expecting {{path}} to be of type {{expected_type}} not of type {{observed_type}}",
                            path=jx_path,
                            expected_type=expected_es_type,
                            observed_type=es_type_info
                        )
                    c = jx_base.Column(
                        name=join_field(jx_path),
                        es_column=coalesce(tops, text(es_path)),
                        es_index=self.es_index,
                        es_type=coalesce(es_type_info, expected_es_type),
                        jx_type=json_type,
                        nested_path=nested_path,
                        last_updated=now,
                    )
                    columns.append(c)
                else:
                    c = jx_base.Column(
                        name=join_field(jx_path),
                        es_column=text(es_path),
                        es_index=self.es_index,
                        es_type="RECORD",
                        jx_type=OBJECT,
                        nested_path=nested_path,
                        last_updated=now,
                    )
                    columns.append(c)
                    count = len(columns)
                    for k, s in schema.items():
                        if k == NESTED_TYPE:
                            c.jx_type = NESTED
                            parse_schema(
                                s,
                                tops if is_text(tops) else tops[k],
                                es_type_info
                                if is_text(es_type_info)
                                else es_type_info[k],
                                jx_path + (k,),
                                (jx_path,) + nested_path,
                                es_path + escape_name(k),
                            )
                        else:
                            parse_schema(
                                s,
                                tops if is_text(tops) else tops[k],
                                es_type_info
                                if is_text(es_type_info)
                                else es_type_info[k],
                                jx_path + (k,),
                                nested_path,
                                es_path + escape_name(k),
                            )
                    if is_text(tops) and len(columns) > count + 1:
                        Log.error(
                            "too many top level fields at {{field}}:",
                            field=join_field(jx_path),
                        )

            parse_schema(
                self.schema,
                self.top_level_fields,
                self._es_type_info,
                (),
                (".",),
                ApiName(),
            )
            self._columns = columns

            self._top_level_fields = {}
            for path, field in wrap(self.top_level_fields).leaves():
                leaves = self.leaves(path)
                if not leaves:
                    continue
                if len(leaves) > 1:
                    Log.error(
                        "expecting {{path}} to have just one primitive value", path=path
                    )
                specific_path = first(leaves).name
                self._top_level_fields[
                    ".".join(map(text, map(escape_name, split_field(specific_path))))
                ] = field

            self._partition = Partition(kwargs=self.partition, flake=self)

        return self._columns
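
A hedged sketch of the schema shape `parse_schema` walks: leaf values are JSON type names, nested dicts become RECORD columns, and a `NESTED_TYPE` key marks a repeated field. The field names below are illustrative:

example_schema = {
    "build": {
        "id": "string",     # leaf -> primitive column
        "time": "number",
    },
    "tasks": {NESTED_TYPE: {"name": "string"}},  # repeated RECORD -> jx_type NESTED
}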