def query(self, q):
    """
    Run jx query `q` against this cube.

    Aggregation queries are handled directly; anything else is deferred
    to a ListContainer built from this cube's rows.
    """
    if is_aggs(q):
        return cube_aggs(self, q)

    # DEFER TO ListContainer
    from jx_python.containers.list_usingPythonList import ListContainer

    schema = wrap({s.name: s for s in self.select + self.edges})
    rows = ListContainer(name="", data=self.values(), schema=schema)
    return rows.query(q)
def query(self, q):
    """
    Answer query `q` over this cube's data.

    If `q` is an aggregation it is evaluated in-place; otherwise the cube
    is converted to a row container and the query is delegated to it.
    """
    source = self
    if is_aggs(q):
        return cube_aggs(source, q)

    column_schema = wrap({s.name: s for s in source.select + source.edges})
    # DEFER TO ListContainer
    from jx_python.containers.list_usingPythonList import ListContainer

    source = ListContainer(name="", data=source.values(), schema=column_schema)
    return source.query(q)
def test_in_w_multi_value(self):
    """Verify the `in` operator matches scalar and multi-value fields alike."""

    def flag(name, condition):
        # SELECT 1 WHEN condition HOLDS, ELSE 0
        return {"name": name, "value": {"when": condition, "then": 1, "else": 0}}

    data = [
        {"a": "e"},
        {"a": "c"},
        {"a": ["e"]},
        {"a": ["c"]},
        {"a": ["e", "c"]},
        {}
    ]
    result = jx.run({
        "from": ListContainer(".", data),
        "select": [
            "a",
            flag("is_e", {"in": [{"literal": "e"}, "a"]}),
            flag("not_e", {"not": {"in": [{"literal": "e"}, "a"]}}),
            flag("is_c", {"in": [{"literal": "c"}, "a"]})
        ]
    })
    expected = {"data": [
        {"a": "e", "is_e": 1, "not_e": 0, "is_c": 0},
        {"a": "c", "is_e": 0, "not_e": 1, "is_c": 1},
        {"a": "e", "is_e": 1, "not_e": 0, "is_c": 0},
        {"a": "c", "is_e": 0, "not_e": 1, "is_c": 1},
        {"a": ["e", "c"], "is_e": 1, "not_e": 0, "is_c": 1},
        {"a": NULL, "is_e": 0, "not_e": 1, "is_c": 0}
    ]}
    self.assertAlmostEqual(result, expected)
def denormalized(self):
    """
    THE INTERNAL STRUCTURE FOR THE COLUMN METADATA IS VERY DIFFERENT FROM
    THE DENORMALIZED PERSPECITVE. THIS PROVIDES THAT PERSPECTIVE FOR QUERIES
    """
    with self.locker:
        self._update_meta()
        flattened = []
        for tname, css in self.data.items():
            for cname, cs in css.items():
                for c in cs:
                    if c.jx_type in STRUCT:  # and c.es_column != "_id"
                        continue
                    flattened.append({
                        "table": c.es_index,
                        "name": untype_path(c.name),
                        "cardinality": c.cardinality,
                        "es_column": c.es_column,
                        "es_index": c.es_index,
                        "last_updated": c.last_updated,
                        "count": c.count,
                        "nested_path": [unnest_path(n) for n in c.nested_path],
                        "es_type": c.es_type,
                        "type": c.jx_type,
                    })
    from jx_python.containers.list_usingPythonList import ListContainer

    return ListContainer(
        self.name,
        data=flattened,
        schema=jx_base.Schema(META_COLUMNS_NAME, SIMPLE_METADATA_COLUMNS),
    )
def find_container(frum, after):
    """
    Resolve a "from" clause to something queryable.

    :param frum: a table name (text), a query-like mapping (data), or a
        container/iterable of rows
    :param after: forwarded to namespace.get_columns() to force a column
        metadata refresh
    :return: a Container for text/typed sources, a QueryOp for nested
        queries, a ListContainer for plain iterables, else `frum` unchanged
    """
    global namespace
    if not namespace:
        # lazily build the (module-level) namespace from default settings
        if not container.config.default.settings:
            Log.error(
                "expecting jx_base.container.config.default.settings to contain default elasticsearch connection info"
            )
        namespace = ElasticsearchMetadata(container.config.default.settings)

    if not frum:
        Log.error("expecting json query expression with from clause")

    # FORCE A RELOAD
    namespace.get_columns(frum, after=after)

    if is_text(frum):
        if frum in container_cache:
            return container_cache[frum]

        path = split_field(frum)
        if path[0] == "meta":
            # metadata tables are served directly from the namespace
            if path[1] == "columns":
                return namespace.meta.columns.denormalized()
            elif path[1] == "tables":
                return namespace.meta.tables
            else:
                fact_table_name = join_field(path[:2])
        else:
            fact_table_name = path[0]

        type_ = container.config.default.type
        settings = set_default(
            {
                "alias": fact_table_name,
                "name": frum,
                "exists": True
            },
            container.config.default.settings,
        )
        settings.type = None
        output = container.type2container[type_](settings)
        container_cache[frum] = output
        return output
    elif is_data(frum) and frum.type and container.type2container[frum.type]:
        # TODO: Ensure the frum.name is set, so we capture the deep queries
        # (the former `if not frum.type: Log.error(...)` guard was removed:
        # it was unreachable because this branch already requires frum.type)
        return container.type2container[frum.type](frum.settings)
    elif is_data(frum) and (frum["from"] or is_container(frum["from"])):
        from jx_base.query import QueryOp

        return QueryOp.wrap(frum)
    elif is_container(frum):
        return ListContainer("test_list", frum)
    else:
        return frum
def denormalized(self):
    """
    THE INTERNAL STRUCTURE FOR THE COLUMN METADATA IS VERY DIFFERENT FROM
    THE DENORMALIZED PERSPECITVE. THIS PROVIDES THAT PERSPECTIVE FOR QUERIES
    """
    rows = []
    for tname, css in self.data.items():
        for cname, cs in css.items():
            for c in cs:
                if c.type in STRUCT:  # and c.es_column != "_id"
                    continue
                # one output row per (table, name) the column is known by
                for table, name in c.names.items():
                    rows.append({
                        "table": concat_field(c.es_index, untype_path(table)),
                        "name": untype_path(name),
                        "cardinality": c.cardinality,
                        "es_column": c.es_column,
                        "es_index": c.es_index,
                        "last_updated": c.last_updated,
                        "count": c.count,
                        "nested_path": [unnest_path(n) for n in c.nested_path],
                        "type": c.type
                    })

    if not self.meta_schema:
        self.meta_schema = get_schema_from_list("meta\\.columns", rows)

    from jx_python.containers.list_usingPythonList import ListContainer

    return ListContainer("meta\\.columns", data=rows, schema=self.meta_schema)
def __init__(self, host, index, sql_file='metadata.sqlite', alias=None, name=None, port=9200, kwargs=None):
    """
    Build the metadata manager and start its background refresh worker.

    :param host: NOTE(review): not referenced in this body — presumably
        folded into `kwargs` by a decorator/caller; confirm
    :param index: default index name, used for the default name
    :param sql_file: NOTE(review): unused here — presumably legacy; confirm
    :param alias: optional alias, used for the default name
    :param name: optional explicit default name
    :param port: NOTE(review): not referenced in this body; confirm
    :param kwargs: full settings bundle, stored on self.settings
    """
    # singleton-style guard: a repeated call on the same instance is a no-op
    if hasattr(self, "settings"):
        return
    self.too_old = TOO_OLD
    self.settings = kwargs
    self.default_name = coalesce(name, alias, index)
    self.es_cluster = elasticsearch.Cluster(kwargs=kwargs)
    self.index_does_not_exist = set()
    # work queue of metadata-refresh requests, deduplicated
    self.todo = Queue("refresh metadata", max=100000, unique=True)
    self.index_to_alias = Relation_usingList()
    self.es_metadata = Null
    # backdated so the first lookup forces a metadata pull
    self.metadata_last_updated = Date.now() - OLD_METADATA
    self.meta = Data()
    self.meta.columns = ColumnList()
    # the two metadata tables are always queryable from the root path
    self.alias_to_query_paths = {
        "meta.columns": [['.']],
        "meta.tables": [['.']]
    }
    self.alias_last_updated = {
        "meta.columns": Date.now(),
        "meta.tables": Date.now()
    }
    table_columns = metadata_tables()
    self.meta.tables = ListContainer(
        "meta.tables",
        [
            # TableDesc("meta.columns", None, ".", Date.now()),
            # TableDesc("meta.tables", None, ".", Date.now())
        ],
        jx_base.Schema(".", table_columns)
    )
    # register the meta-table columns themselves as known columns
    self.meta.columns.extend(table_columns)
    # TODO: fix monitor so it does not bring down ES
    if ENABLE_META_SCAN:
        self.worker = Thread.run("refresh metadata", self.monitor)
    else:
        self.worker = Thread.run("refresh metadata", self.not_monitor)
    return
def query(self, query):
    """
    Run a jx query over the column metadata.

    NOTE(review): Log.error appears to halt execution (presumably raises),
    making everything below unreachable — the body looks retained as a
    reference implementation; confirm Log.error semantics.
    """
    # NOT EXPECTED TO BE RUN
    Log.error("not")
    with self.locker:
        self._update_meta()
        # lazily build a schema covering every known column set
        if not self._schema:
            self._schema = Schema(".", [
                c
                for cs in self.data["meta.columns"].values()
                for c in cs
            ])
        snapshot = self._all_columns()
    # local import — presumably avoids a circular import; confirm
    from jx_python.containers.list_usingPythonList import ListContainer
    query.frum = ListContainer("meta.columns", snapshot, self._schema)
    return jx.run(query)
def _get_queue(self, row):
    """
    Return the threaded insert-queue for the rollover index that should
    receive `row`, creating the index (and queue) when necessary.

    :param row: record (or {"json": ...} wrapper) to be routed by its
        rollover-field timestamp
    :return: Null if the row has no timestamp, DATA_TOO_OLD if it is past
        the rollover_max horizon, else a threaded queue
    """
    row = wrap(row)
    # decode a raw-json payload into row.value before routing
    if row.json:
        row.value, row.json = json2value(row.json), None
    timestamp = Date(self.rollover_field(row.value))
    if timestamp == None:
        return Null
    elif timestamp < Date.today() - self.rollover_max:
        return DATA_TOO_OLD

    # all rows in the same rollover interval share one index/queue
    rounded_timestamp = timestamp.floor(self.rollover_interval)
    with self.locker:
        queue = self.known_queues.get(rounded_timestamp.unix)
    if queue == None:
        # list existing indexes matching the <index>YYYYMMDD_HHMMSS pattern
        candidates = jx.run({
            "from": ListContainer(".", self.cluster.get_aliases()),
            "where": {"regex": {"index": self.settings.index + "\d\d\d\d\d\d\d\d_\d\d\d\d\d\d"}},
            "sort": "index"
        })
        best = None
        for c in candidates:
            c = wrap(c)
            # index names end with a 15-char date suffix
            c.date = unicode2Date(c.index[-15:], elasticsearch.INDEX_DATE_FORMAT)
            if timestamp > c.date:
                best = c
        if not best or rounded_timestamp > best.date:
            if rounded_timestamp < wrap(candidates[-1]).date:
                # an existing (newer) index already covers this interval
                es = self.cluster.get_or_create_index(read_only=False, alias=best.alias, index=best.index, kwargs=self.settings)
            else:
                try:
                    es = self.cluster.create_index(create_timestamp=rounded_timestamp, kwargs=self.settings)
                    es.add_alias(self.settings.index)
                except Exception as e:
                    e = Except.wrap(e)
                    # a concurrent creator winning the race is benign
                    if "IndexAlreadyExistsException" not in e:
                        Log.error("Problem creating index", cause=e)
                    return self._get_queue(row)  # TRY AGAIN
        else:
            es = self.cluster.get_or_create_index(read_only=False, alias=best.alias, index=best.index, kwargs=self.settings)

        # best-effort tuning; failures are deliberately ignored
        with suppress_exception:
            es.set_refresh_interval(seconds=60 * 5, timeout=5)

        self._delete_old_indexes(candidates)
        threaded_queue = es.threaded_queue(max_size=self.settings.queue_size, batch_size=self.settings.batch_size, silent=True)
        with self.locker:
            queue = self.known_queues[rounded_timestamp.unix] = threaded_queue
    return queue
def __init__(self, host, index, alias=None, name=None, port=9200, kwargs=None):
    """
    Build elasticsearch metadata state and start the refresh worker.

    :param host: NOTE(review): not referenced directly — presumably folded
        into `kwargs` by the caller; confirm
    :param index: default index name
    :param alias: optional alias (reused as a loop variable below)
    :param name: optional explicit name
    :param port: NOTE(review): not referenced directly; confirm
    :param kwargs: full settings bundle, stored on self.settings
    """
    # singleton-style guard: repeated calls on the same instance are no-ops
    if hasattr(self, "settings"):
        return
    self.settings = kwargs
    self.too_old = TOO_OLD
    self.es_cluster = elasticsearch.Cluster(kwargs=kwargs)
    self.index_does_not_exist = set()
    # deduplicated work queue of metadata-refresh requests
    self.todo = Queue("refresh metadata", max=100000, unique=True)

    self.meta = Data()
    self.meta.columns = ColumnList(self.es_cluster)
    self.meta.columns.extend(META_TABLES_DESC.columns)
    self.meta.tables = ListContainer(META_TABLES_NAME, [], jx_base.Schema(".", META_TABLES_DESC.columns))
    # BUGFIX: was `self.meta.table.extend(...)` (singular); on a Data() the
    # missing `table` attribute is Null, so the extend was silently dropped
    self.meta.tables.extend([META_COLUMNS_DESC, META_TABLES_DESC])
    self.alias_to_query_paths = {}
    for i, settings in self.es_cluster.get_metadata().indices.items():
        if len(settings.aliases) == 0:
            alias = i
        elif len(settings.aliases) == 1:
            alias = first(settings.aliases)
        else:
            Log.error("expecting only one alias per index")

        desc = TableDesc(
            name=alias,
            url=None,
            query_path=ROOT_PATH,
            last_updated=Date.MIN,
            columns=[]
        )
        self.meta.tables.add(desc)
        self.alias_to_query_paths[alias] = [desc.query_path]
        self.alias_to_query_paths[self._find_alias(alias)] = [desc.query_path]

    # WE MUST PAUSE?

    # TODO: fix monitor so it does not bring down ES
    if ENABLE_META_SCAN:
        self.worker = Thread.run("refresh metadata", self.monitor)
    else:
        self.worker = Thread.run("not refresh metadata", self.not_monitor)
    return
def __init__(self, host, index, sql_file='metadata.sqlite', alias=None, name=None, port=9200, kwargs=None):
    """
    Build metadata state for a cluster and start the refresh worker.

    :param host: NOTE(review): not referenced in this body — presumably
        consumed via `kwargs`; confirm
    :param index: default index name, used for the default name
    :param sql_file: NOTE(review): unused here — presumably legacy; confirm
    :param alias: optional alias, used for the default name
    :param name: optional explicit default name
    :param port: NOTE(review): not referenced in this body; confirm
    :param kwargs: full settings bundle, stored on self.settings
    """
    # singleton-style guard: repeated calls on the same instance are no-ops
    if hasattr(self, "settings"):
        return
    self.too_old = TOO_OLD
    self.settings = kwargs
    self.default_name = coalesce(name, alias, index)
    self.default_es = elasticsearch.Cluster(kwargs=kwargs)
    self.index_does_not_exist = set()
    # deduplicated work queue of metadata-refresh requests
    self.todo = Queue("refresh metadata", max=100000, unique=True)

    self.es_metadata = Null
    self.abs_columns = set()
    # backdated so the first use forces a metadata pull
    self.last_es_metadata = Date.now() - OLD_METADATA

    self.meta = Data()
    table_columns = metadata_tables()
    column_columns = metadata_columns()
    # schema keyed by each column's root-path name
    self.meta.tables = ListContainer(
        "meta.tables",
        [],
        wrap({c.names["."]: c for c in table_columns})
    )
    self.meta.columns = ColumnList()
    self.meta.columns.insert(column_columns)
    self.meta.columns.insert(table_columns)
    # TODO: fix monitor so it does not bring down ES
    if ENABLE_META_SCAN:
        self.worker = Thread.run("refresh metadata", self.monitor)
    else:
        self.worker = Thread.run("refresh metadata", self.not_monitor)
    return
def download_perfherder(desc, repo, id, dummy, framework):
    """
    Fetch one perfherder performance series from treeherder and log it,
    sorted by push timestamp, as {"name": desc, "data": [...]}.
    """
    project_url = "https://treeherder.mozilla.org/api/project/" + repo

    signatures_url = (
        project_url
        + "/performance/signatures/?format=json&framework="
        + str(framework)
        + "&id="
        + str(id)
    )
    sig_result = http.get_json(signatures_url)
    signature = first(sig_result.keys())

    data_result = http.get_json(project_url + "/performance/data/?signatures=" + signature)

    values = jx.run({
        "from": ListContainer("data", data_result[signature]),
        "sort": "push_timestamp",
        "select": "value"
    }).data
    Log.note(
        "{{result|json}}",
        result={
            "name": desc,
            "data": values
        },
    )