def condense(self):
    """
    REMOVE DUPLICATE RECORDS, KEEPING ONLY THE LATEST VERSION OF EACH id
    """
    # MAKE NEW SHARD
    partition = JoinSQL(
        SQL_COMMA,
        [
            quote_column(c.es_field)
            for f in listwrap(self.id.field)
            for c in self.flake.leaves(f)
        ],
    )
    order_by = JoinSQL(
        SQL_COMMA,
        [
            ConcatSQL(quote_column(c.es_field), SQL_DESC)
            for f in listwrap(self.id.version)
            for c in self.flake.leaves(f)
        ],
    )
    # WRAP WITH etl.timestamp BEST SELECTION
    self.container.query_and_wait(ConcatSQL(
        SQL(  # SOME KEYWORDS: ROWNUM RANK
            "SELECT * EXCEPT (_rank) FROM (SELECT *, ROW_NUMBER() OVER (PARTITION BY "
        ),
        partition,
        SQL_ORDERBY,
        order_by,
        SQL(") AS _rank FROM "),
        quote_column(self.full_name),
        SQL(") a WHERE _rank=1"),
    ))
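# A sketch of the deduplication SQL that condense() assembles, assuming an id field
# `a` and a version field `v` (hypothetical names):
#
#   SELECT * EXCEPT (_rank)
#   FROM (
#       SELECT *, ROW_NUMBER() OVER (PARTITION BY `a` ORDER BY `v` DESC) AS _rank
#       FROM `shard`
#   ) a
#   WHERE _rank=1
#
# ROW_NUMBER() numbers the records in each id partition newest-first, and the outer
# filter keeps only the first, leaving one record per id.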
def sql_query(command):
    """
    VERY BASIC QUERY EXPRESSION TO SQL
    :param command: jx-expression
    :return: SQL
    """
    command = wrap(command)
    acc = [SQL_SELECT]
    if command.select:
        acc.append(JoinSQL(SQL_COMMA, map(quote_column, listwrap(command.select))))
    else:
        acc.append(SQL_STAR)
    acc.append(SQL_FROM)
    acc.append(quote_column(command["from"]))
    if command.where:
        acc.append(SQL_WHERE)
        if command.where.eq:
            acc.append(sql_eq(**command.where.eq))
        else:
            where = esfilter2sqlwhere(command.where)
            acc.append(where)

    sort = coalesce(command.orderby, command.sort)
    if sort:
        acc.append(SQL_ORDERBY)
        acc.append(JoinSQL(SQL_COMMA, map(quote_column, listwrap(sort))))

    if command.limit:
        acc.append(SQL_LIMIT)
        acc.append(JoinSQL(SQL_COMMA, map(quote_value, listwrap(command.limit))))

    return ConcatSQL(*acc)
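# A minimal usage sketch of sql_query(); the table and column names are hypothetical,
# and the rendered SQL is approximate (quoting depends on the quote_column in scope).
def _example_sql_query():
    # Renders roughly:
    #   SELECT `id`, `state` FROM `task` WHERE `state`='completed' ORDER BY `id` LIMIT 10
    return sql_query({
        "select": ["id", "state"],
        "from": "task",
        "where": {"eq": {"state": "completed"}},
        "orderby": "id",
        "limit": 10,
    })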
def to_sql(self, schema, not_null=False, boolean=False):
    lhs = SQLang[self.lhs].partial_eval()
    rhs = SQLang[self.rhs].partial_eval()
    lhs_sql = lhs.to_sql(schema, not_null=True)
    rhs_sql = rhs.to_sql(schema, not_null=True)
    if is_literal(rhs) and lhs_sql[0].sql.b != None and rhs.value in ("T", "F"):
        rhs_sql = BooleanOp(rhs).to_sql(schema)
    if is_literal(lhs) and rhs_sql[0].sql.b != None and lhs.value in ("T", "F"):
        lhs_sql = BooleanOp(lhs).to_sql(schema)

    if len(lhs_sql) != len(rhs_sql):
        Log.error("lhs and rhs have different dimensionality!?")

    acc = []
    for l, r in zip(lhs_sql, rhs_sql):
        for t in "bsnj":
            if r.sql[t] == None:
                if l.sql[t] == None:
                    pass
                else:
                    acc.append(ConcatSQL(l.sql[t], SQL_IS_NULL))
            elif l.sql[t] == None:
                acc.append(ConcatSQL(r.sql[t], SQL_IS_NULL))
            else:
                acc.append(ConcatSQL(sql_iso(l.sql[t]), SQL_EQ, sql_iso(r.sql[t])))
    if not acc:
        return FALSE.to_sql(schema)
    else:
        return wrap([{"name": ".", "sql": {"b": JoinSQL(SQL_OR, acc)}}])
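# Illustration of the typed comparison above (assumed sql type codes: b=boolean,
# s=string, n=number, j=json). For each type t found on either side:
#   - both sides have a t-typed SQL  ->  (lhs_t) = (rhs_t)
#   - only one side has it           ->  <present side> IS NULL  (equal only if missing)
#   - neither side has it            ->  no clause
# The clauses are OR'ed, so two values compare equal when any one of their typed
# representations matches (or the sole representation is NULL on the other side).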
def to_sql(self, schema, not_null=False, boolean=False):
    return wrap([{
        "name": ".",
        "sql": {
            "b": JoinSQL(
                SQL_OR,
                [
                    sql_iso(SQLang[t].to_sql(schema, boolean=True)[0].sql.b)
                    for t in self.terms
                ],
            )
        },
    }])
def quote_column(*path):
    if DEBUG:
        if not path:
            Log.error("expecting a name")
        for p in path:
            if not is_text(p):
                Log.error("expecting strings, not {{type}}", type=p.__class__.__name__)
    try:
        output = ConcatSQL(
            SQL_SPACE,
            JoinSQL(SQL_DOT, [SQL(quote(p)) for p in path]),
            SQL_SPACE,
        )
        return output
    except Exception as e:
        Log.error("Not expected", cause=e)
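# Usage sketch: each path element is quoted via quote() and dot-joined, with a leading
# and trailing space so callers can concatenate the result safely. Assuming backtick
# quoting, quote_column("mydataset", "mytable") renders near ` `mydataset`.`mytable` `.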
def sql_query(command):
    """
    VERY BASIC QUERY EXPRESSION TO SQL
    :param command: jx-expression
    :return: SQL
    """
    command = wrap(command)
    acc = [SQL_SELECT]
    if command.select:
        acc.append(JoinSQL(SQL_COMMA, map(quote_column, listwrap(command.select))))
    else:
        acc.append(SQL_STAR)
    acc.append(SQL_FROM)
    acc.append(quote_column(command["from"]))
    if command.where.eq:
        acc.append(SQL_WHERE)
        acc.append(sql_eq(**command.where.eq))
    if command.orderby:
        acc.append(SQL_ORDERBY)
        acc.append(JoinSQL(SQL_COMMA, map(quote_column, listwrap(command.orderby))))
    return ConcatSQL(*acc)
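# Usage sketch for this simpler variant (hypothetical names): only eq-filters and
# orderby are supported; any other `where` clause is ignored rather than translated.
#   sql_query({"from": "jobs", "where": {"eq": {"state": "done"}}, "orderby": "id"})
#   ->  SELECT * FROM `jobs` WHERE `state`='done' ORDER BY `id`   (approximate rendering)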
def where(self, filter):
    """
    WILL NOT PULL WHOLE OBJECT, JUST TOP-LEVEL PROPERTIES
    :param filter: jx_expression filter
    :return: list of objects that match
    """
    select = []
    column_names = []
    for c in self.schema.columns:
        if c.jx_type in STRUCT:
            continue
        if len(c.nested_path) != 1:
            continue
        column_names.append(c.name)
        select.append(sql_alias(quote_column(c.es_column), c.name))

    where_sql = SQLang[jx_expression(filter)].to_sql(self.schema)[0].sql.b
    result = self.db.query(ConcatSQL(
        SQL_SELECT,
        JoinSQL(SQL_COMMA, select),
        SQL_FROM,
        quote_column(self.snowflake.fact_name),
        SQL_WHERE,
        where_sql,
    ))
    return wrap([{c: v for c, v in zip(column_names, r)} for r in result.data])
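# Usage sketch (hypothetical container and filter): only scalar, top-level columns are
# selected; structs and columns on deeper nested paths are skipped entirely.
#   rows = facts.where({"eq": {"state": "done"}})   # -> list of {name: value} dicts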
def sql_call(func_name, *parameters):
    return ConcatSQL(SQL(func_name), sql_iso(JoinSQL(SQL_COMMA, parameters)))
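# Usage sketch: sql_call() renders a function call with parenthesized, comma-joined
# arguments; e.g. sql_call("COALESCE", quote_column("a"), quote_column("b")) gives
# roughly COALESCE(`a`, `b`). An empty func_name yields bare parentheses, which is how
# callers wrap a subquery (see the sql_call("", *inner) usage in _gen_select below).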
def quote_column(*path):
    if not path:
        Log.error("missing column_name")
    if len(path) == 1:
        return SQL("`" + path[0].replace("`", "``") + "`")
    return JoinSQL(SQL_DOT, map(quote_column, path))
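# Usage sketch: embedded backticks are escaped by doubling, and multi-part paths
# recurse, joining the quoted parts with dots:
#   quote_column("my`col")        ->  `my``col`
#   quote_column("db", "table")   ->  `db`.`table`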
def clean(self):
    """
    REMOVE ANY RECORDS THAT ARE NOT NEEDED BY QUEUE OR SUBSCRIBERS
    """
    now = Date.now()

    # ANY BLOCKS TO FLUSH?
    with self.db.transaction() as t:
        result = t.query(SQL(
            f"""
            SELECT id, block_size_mb, block_start
            FROM {QUEUE} AS q
            JOIN {BLOCKS} AS b ON b.queue = q.id AND b.serial = q.block_start
            WHERE b.last_used < {quote_value(now - Duration(WRITE_INTERVAL))}
            """
        ))
        for stale in rows(result):
            queue = first(q for q in self.queues if q.id == stale.id)
            queue._flush(**stale)

    # REMOVE UNREACHABLE MESSAGES
    conditions = []
    for q in self.queues:
        conditions.append(
            SQL(f"(queue = {quote_value(q.id)} AND serial IN (")
            + SQL(
                f"""
                SELECT m.serial
                FROM {MESSAGES} AS m
                LEFT JOIN {UNCONFIRMED} AS u ON u.serial = m.serial
                LEFT JOIN {SUBSCRIBER} AS s ON s.queue = m.queue AND s.id = u.subscriber
                LEFT JOIN {QUEUE} AS q ON q.id = m.queue AND m.serial >= q.block_start
                LEFT JOIN {SUBSCRIBER} AS la
                    ON la.queue = m.queue
                    AND la.last_confirmed_serial < m.serial
                    AND m.serial < la.next_emit_serial + la.look_ahead_serial
                WHERE
                    m.queue = {q.id} AND
                    s.id IS NULL AND   -- STILL UNCONFIRMED POP
                    q.id IS NULL AND   -- NOT WRITTEN TO S3 YET
                    la.id IS NULL      -- NOT IN LOOK-AHEAD FOR SUBSCRIBER
                """
            )
            + SQL("))")
        )

    with self.db.transaction() as t:
        if DEBUG:
            result = t.query(ConcatSQL(
                SQL(f"SELECT count(1) AS `count` FROM {MESSAGES} WHERE "),
                JoinSQL(SQL_OR, conditions),
            ))
            Log.note("Delete {{num}} messages from database", num=first_row(result).count)
        t.execute(ConcatSQL(
            SQL(f"DELETE FROM {MESSAGES} WHERE "),
            JoinSQL(SQL_OR, conditions),
        ))
def _gen_select(source_path, source_tops, source_flake, total_path, total_tops, total_flake):
    if total_flake == source_flake and not total_tops:
        return [
            quote_column(source_path + escape_name(k))
            for k in jx.sort(total_flake.keys())
        ]

    if NESTED_TYPE in total_flake:
        # PROMOTE EVERYTHING TO REPEATED
        v = source_flake.get(NESTED_TYPE)
        t = total_flake.get(NESTED_TYPE)

        if not v:
            # CONVERT INNER OBJECT TO ARRAY OF ONE STRUCT
            inner = [
                ConcatSQL(
                    SQL_SELECT_AS_STRUCT,
                    JoinSQL(
                        ConcatSQL(SQL_COMMA, SQL_CR),
                        _gen_select(
                            source_path, Null, source_flake,
                            total_path + REPEATED, Null, t,
                        ),
                    ),
                )
            ]
        else:
            row_name = "row" + text(len(source_path.values))
            ord_name = "ordering" + text(len(source_path.values))
            inner = [
                ConcatSQL(
                    SQL_SELECT_AS_STRUCT,
                    JoinSQL(
                        ConcatSQL(SQL_COMMA, SQL_CR),
                        _gen_select(ApiName(row_name), Null, v, ApiName(row_name), Null, t),
                    ),
                    SQL_FROM,
                    sql_call("UNNEST", quote_column(source_path + REPEATED)),
                    SQL_AS,
                    SQL(row_name),
                    SQL(" WITH OFFSET AS "),
                    SQL(ord_name),
                    SQL_ORDERBY,
                    SQL(ord_name),
                )
            ]

        return [sql_alias(sql_call("ARRAY", *inner), REPEATED)]

    selection = []
    for k, t in jx.sort(total_flake.items(), 0):
        k_total_tops = total_tops if is_text(total_tops) else total_tops[k]
        k_tops = source_tops if is_text(source_tops) else source_tops[k]
        v = source_flake.get(k)
        if is_text(k_total_tops):
            # DO NOT INCLUDE TOP_LEVEL_FIELDS
            pass
        elif t == v and not k_total_tops and not k_tops:
            selection.append(ConcatSQL(
                quote_column(source_path + escape_name(k)),
                SQL_AS,
                quote_column(escape_name(k)),
            ))
        elif is_data(t):
            if not v:
                selects = _gen_select(
                    source_path + escape_name(k), source_tops, {},
                    total_path + escape_name(k), k_total_tops, t,
                )
            elif is_data(v):
                selects = _gen_select(
                    source_path + escape_name(k), source_tops, v,
                    total_path + escape_name(k), k_total_tops, t,
                )
            else:
                raise Log.error(
                    "Datatype mismatch on {{field}}: Can not merge {{type}} into {{main}}",
                    field=join_field(source_path + escape_name(k)),
                    type=v,
                    main=t,
                )
            if selects:
                inner = [
                    ConcatSQL(
                        SQL_SELECT_AS_STRUCT,
                        JoinSQL(ConcatSQL(SQL_COMMA, SQL_CR), selects),
                    )
                ]
                selection.append(sql_alias(sql_call("", *inner), escape_name(k)))
        elif is_text(t):
            if is_text(k_tops):
                # THE SOURCE HAS THIS PROPERTY AS A TOP_LEVEL_FIELD
                selection.append(ConcatSQL(SQL(k_tops), SQL_AS, quote_column(escape_name(k))))
            elif v == t:
                selection.append(ConcatSQL(
                    quote_column(total_path + escape_name(k)),
                    SQL_AS,
                    quote_column(escape_name(k)),
                ))
            else:
                if v:
                    Log.note(
                        "Datatype mismatch on {{field}}: Can not merge {{type}} into {{main}}",
                        field=join_field(source_path + escape_name(k)),
                        type=v,
                        main=t,
                    )
                selection.append(ConcatSQL(
                    sql_call(
                        "CAST",
                        ConcatSQL(SQL_NULL, SQL_AS, SQL(json_type_to_bq_type[t])),
                    ),
                    SQL_AS,
                    quote_column(escape_name(k)),
                ))
        else:
            Log.error("not expected")
    return selection
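# Sketch of the repeated-field promotion above (field names hypothetical). When the
# merged schema is repeated but a shard stored a lone inner object, the object is
# wrapped as a one-element array:
#   ARRAY(SELECT AS STRUCT `a`, `b`) AS <repeated>
# When the shard is already repeated, each element is re-projected, preserving order:
#   ARRAY(
#       SELECT AS STRUCT row0.`a`
#       FROM UNNEST(<source>.<repeated>) AS row0 WITH OFFSET AS ordering0
#       ORDER BY ordering0
#   ) AS <repeated>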
def merge_shards(self):
    shards = []
    tables = list(self.container.client.list_tables(self.container.dataset))
    current_view = Null  # VIEW THAT POINTS TO PRIMARY SHARD
    primary_shard_name = None  # PRIMARY SHARD
    api_name = escape_name(self.short_name)

    for table_item in tables:
        table = table_item.reference
        table_api_name = ApiName(table.table_id)
        if text(table_api_name).startswith(text(api_name)):
            if table_api_name == api_name:
                if table_item.table_type != "VIEW":
                    Log.error("expecting {{table}} to be a view", table=api_name)
                current_view = self.container.client.get_table(table)
                view_sql = current_view.view_query
                primary_shard_name = _extract_primary_shard_name(view_sql)
            elif SUFFIX_PATTERN.match(text(table_api_name)[len(text(api_name)):]):
                try:
                    known_table = self.container.client.get_table(table)
                    shards.append(known_table)
                except Exception as e:
                    Log.warning("could not merge table {{table}}", table=table, cause=e)

    if not current_view:
        Log.error("expecting {{table}} to be a view pointing to a table", table=api_name)

    shard_flakes = [
        Snowflake.parse(
            big_query_schema=shard.schema,
            es_index=text(self.container.full_name + ApiName(shard.table_id)),
            top_level_fields=self.top_level_fields,
            partition=self.partition,
        )
        for shard in shards
    ]
    total_flake = snowflakes.merge(
        shard_flakes,
        es_index=text(self.full_name),
        top_level_fields=self.top_level_fields,
        partition=self.partition,
    )

    for i, s in enumerate(shards):
        if ApiName(s.table_id) == primary_shard_name:
            if total_flake == shard_flakes[i]:
                # USE THE CURRENT PRIMARY SHARD AS A DESTINATION
                del shards[i]
                del shard_flakes[i]
                break
    else:
        name = self.short_name + "_" + "".join(Random.sample(ALLOWED, 20))
        primary_shard_name = escape_name(name)
        self.container.create_table(
            table=name,
            schema=total_flake.schema,
            sharded=False,
            read_only=False,
            kwargs=self.config,
        )

    primary_full_name = self.container.full_name + primary_shard_name

    selects = []
    for flake, table in zip(shard_flakes, shards):
        q = ConcatSQL(
            SQL_SELECT,
            JoinSQL(ConcatSQL(SQL_COMMA, SQL_CR), gen_select(total_flake, flake)),
            SQL_FROM,
            quote_column(ApiName(table.dataset_id, table.table_id)),
        )
        selects.append(q)

    Log.note("inserting into table {{table}}", table=text(primary_shard_name))

    matched = []
    unmatched = []
    for sel, shard, flake in zip(selects, shards, shard_flakes):
        if flake == total_flake:
            matched.append((sel, shard, flake))
        else:
            unmatched.append((sel, shard, flake))

    # EVERYTHING THAT IS IDENTICAL TO PRIMARY CAN BE MERGED WITH SIMPLE UNION ALL
    if matched:
        for g, merge_chunk in jx.chunk(matched, MAX_MERGE):
            command = ConcatSQL(
                SQL_INSERT,
                quote_column(primary_full_name),
                JoinSQL(
                    SQL_UNION_ALL,
                    (
                        sql_query({"from": self.container.full_name + ApiName(shard.table_id)})
                        for _, shard, _ in merge_chunk
                    ),
                ),
            )
            DEBUG and Log.note("{{sql}}", sql=text(command))
            job = self.container.query_and_wait(command)
            Log.note("job {{id}} state = {{state}}", id=job.job_id, state=job.state)

            if job.errors:
                Log.error(
                    "\n{{sql}}\nDid not fill table:\n{{reason|json|indent}}",
                    sql=command.sql,
                    reason=job.errors,
                )

            for _, shard, _ in merge_chunk:
                self.container.client.delete_table(shard)

    # ALL OTHER SCHEMAS MISMATCH
    for s, shard, _ in unmatched:
        try:
            command = ConcatSQL(SQL_INSERT, quote_column(primary_full_name), s)
            DEBUG and Log.note("{{sql}}", sql=text(command))
            job = self.container.query_and_wait(command)
            Log.note(
                "from {{shard}}, job {{id}}, state {{state}}",
                id=job.job_id,
                shard=shard.table_id,
                state=job.state,
            )

            if job.errors:
                if all(" does not have a schema." in m for m in wrap(job.errors).message):
                    pass  # NOTHING TO DO
                else:
                    Log.error(
                        "\n{{sql}}\nDid not fill table:\n{{reason|json|indent}}",
                        sql=command.sql,
                        reason=job.errors,
                    )

            self.container.client.delete_table(shard)
        except Exception as e:
            Log.warning("failure to merge {{shard}}", shard=shard, cause=e)

    # REMOVE OLD VIEW
    view_full_name = self.container.full_name + api_name
    if current_view:
        self.container.client.delete_table(current_view)

    # CREATE NEW VIEW
    self.container.create_view(view_full_name, primary_full_name)