def create_table(
    self,
    table,
    schema=None,
    typed=True,
    read_only=True,  # TO PREVENT ACCIDENTAL WRITING
    sharded=False,
    partition=Null,  # PARTITION RULES
    cluster=None,  # TUPLE OF FIELDS TO SORT DATA
    top_level_fields=Null,
    kwargs=None,
):
    """
    Create a new BigQuery table (or shard + alias view) in this dataset and
    return a Table wrapper for it.

    :param table: short name of the table (escaped before use)
    :param schema: optional column schema; when falsy a default (typed or
        untyped) schema is copied in
    :param read_only: defaults True so accidental calls do not create writable
        tables; NOTE(review): the guard below routes read_only=True through
        Log.error — presumably that raises (mo-logs convention), so callers
        must pass read_only=False to actually create — confirm
    :param sharded: when True, create a randomly-suffixed shard table and a
        view with the requested name pointing at it
    :param partition: partition rules handed to Snowflake
    :param cluster: fields to derive BigQuery clustering_fields from
    :param kwargs: config bundle; must carry a schema, not a pre-parsed
        lookup/flake (checked below)
    """
    # kwargs is a mo-dots Data; `!=` here is its overloaded comparison, so
    # do not "fix" to `is not None`
    if kwargs.lookup != None or kwargs.flake != None:
        Log.error("expecting schema, not lookup")
    full_name = self.full_name + escape_name(table)
    if not schema:
        # WE MUST HAVE SOMETHING
        if typed:
            schema = copy(DEFAULT_TYPED_SCHEMA)
        else:
            schema = copy(DEFAULT_SCHEMA)
    flake = Snowflake(text(full_name), top_level_fields, partition, schema=schema)

    if read_only:
        Log.error("Can not create a table for read-only use")

    if sharded:
        # REAL TABLE GETS A RANDOM SUFFIX; A VIEW WITH THE REQUESTED NAME
        # POINTS AT IT SO THE SHARD CAN BE SWAPPED LATER
        shard_name = escape_name(table + "_" + "".join(randoms.sample(ALLOWED, 20)))
        shard_api_name = self.full_name + shard_name
        _shard = bigquery.Table(text(shard_api_name), schema=flake.to_bq_schema())
        _shard.time_partitioning = unwrap(flake._partition.bq_time_partitioning)
        # ONLY THE FIRST LEAF OF EACH REQUESTED CLUSTER FIELD IS USED;
        # BigQuery REQUIRES None (NOT []) WHEN THERE IS NO CLUSTERING
        _shard.clustering_fields = [
            c.es_column
            for f in listwrap(cluster)
            for c in [first(flake.leaves(f))]
            if c
        ] or None
        self.shard = self.client.create_table(_shard)
        self.create_view(full_name, shard_api_name)
    else:
        _table = bigquery.Table(text(full_name), schema=flake.to_bq_schema())
        _table.time_partitioning = unwrap(flake._partition.bq_time_partitioning)
        # UNSHARDED: ALL LEAVES OF EACH CLUSTER FIELD ARE USED
        _table.clustering_fields = [
            l.es_column for f in listwrap(cluster) for l in flake.leaves(f)
        ] or None
        self.client.create_table(_table)
        DEBUG and Log.note("created table {{table}}", table=_table.table_id)
    return Table(
        table=table,
        typed=typed,
        read_only=read_only,
        sharded=sharded,
        partition=partition,
        top_level_fields=top_level_fields,
        kwargs=kwargs,
        container=self,
    )
def merge_shards(self):
    """
    Consolidate all shards of this (sharded) table into one primary shard.

    Steps visible below:
    1. Scan the dataset for the alias view and every shard whose name is the
       alias name plus a SUFFIX_PATTERN suffix.
    2. Parse each shard's schema into a Snowflake and merge them into one
       total schema.
    3. Pick a destination: the current primary shard if its schema already
       equals the merged schema, otherwise a freshly created shard.
    4. INSERT the rows of every other shard into the destination (UNION ALL
       in chunks for schema-identical shards; one INSERT per mismatched
       shard), deleting each source shard after its copy.
    5. Re-point the alias view at the destination shard.

    NOTE(review): assumes Log.error raises (mo-logs convention), so the
    "expecting ... view" checks abort the merge — confirm.
    """
    shards = []
    tables = list(self.container.client.list_tables(self.container.dataset))
    current_view = Null  # VIEW THAT POINTS TO PRIMARY SHARD
    primary_shard_name = None  # PRIMARY SHARD
    api_name = escape_name(self.short_name)
    for table_item in tables:
        table = table_item.reference
        table_api_name = ApiName(table.table_id)
        if text(table_api_name).startswith(text(api_name)):
            if table_api_name == api_name:
                # EXACT NAME MATCH MUST BE THE ALIAS VIEW
                if table_item.table_type != "VIEW":
                    Log.error("expecting {{table}} to be a view", table=api_name)
                current_view = self.container.client.get_table(table)
                view_sql = current_view.view_query
                primary_shard_name = _extract_primary_shard_name(view_sql)
            elif SUFFIX_PATTERN.match(text(table_api_name)[len(text(api_name)) :]):
                # NAME PLUS A VALID SUFFIX IS A SHARD; SKIP (WITH WARNING)
                # ANY SHARD WE CAN NOT FETCH
                try:
                    known_table = self.container.client.get_table(table)
                    shards.append(known_table)
                except Exception as e:
                    Log.warning(
                        "could not merge table {{table}}", table=table, cause=e
                    )

    if not current_view:
        Log.error(
            "expecting {{table}} to be a view pointing to a table", table=api_name
        )

    # PARSE EACH SHARD'S SCHEMA, THEN MERGE INTO THE TOTAL SCHEMA
    shard_flakes = [
        Snowflake.parse(
            big_query_schema=shard.schema,
            es_index=text(self.container.full_name + ApiName(shard.table_id)),
            top_level_fields=self.top_level_fields,
            partition=self.partition,
        )
        for shard in shards
    ]
    total_flake = snowflakes.merge(
        shard_flakes,
        es_index=text(self.full_name),
        top_level_fields=self.top_level_fields,
        partition=self.partition,
    )

    # CHOOSE DESTINATION: for/else — the else runs only when no break, i.e.
    # when the current primary shard is missing or its schema differs
    for i, s in enumerate(shards):
        if ApiName(s.table_id) == primary_shard_name:
            if total_flake == shard_flakes[i]:
                # USE THE CURRENT PRIMARY SHARD AS A DESTINATION
                del shards[i]
                del shard_flakes[i]
                break
    else:
        name = self.short_name + "_" + "".join(randoms.sample(ALLOWED, 20))
        primary_shard_name = escape_name(name)
        self.container.create_table(
            table=name,
            schema=total_flake.schema,
            sharded=False,
            read_only=False,
            kwargs=self.config,
        )

    primary_full_name = self.container.full_name + primary_shard_name

    # BUILD ONE SELECT PER REMAINING SHARD, MAPPED ONTO THE TOTAL SCHEMA
    selects = []
    for flake, table in zip(shard_flakes, shards):
        q = ConcatSQL(
            SQL_SELECT,
            JoinSQL(ConcatSQL(SQL_COMMA, SQL_CR), gen_select(total_flake, flake)),
            SQL_FROM,
            quote_column(ApiName(table.dataset_id, table.table_id)),
        )
        selects.append(q)

    DEBUG and Log.note(
        "inserting into table {{table}}", table=text(primary_shard_name)
    )

    # SPLIT SHARDS BY WHETHER THEIR SCHEMA ALREADY EQUALS THE TOTAL SCHEMA
    matched = []
    unmatched = []
    for sel, shard, flake in zip(selects, shards, shard_flakes):
        if flake == total_flake:
            matched.append((sel, shard, flake))
        else:
            unmatched.append((sel, shard, flake))

    # EVERYTHING THAT IS IDENTICAL TO PRIMARY CAN BE MERGED WITH SIMPLE UNION ALL
    if matched:
        for g, merge_chunk in jx.chunk(matched, MAX_MERGE):
            command = ConcatSQL(
                SQL_INSERT,
                quote_column(primary_full_name),
                JoinSQL(
                    SQL_UNION_ALL,
                    (
                        sql_query(
                            {
                                "from": text(
                                    self.container.full_name + ApiName(shard.table_id)
                                )
                            },
                            schema,
                        )
                        for _, shard, schema in merge_chunk
                    ),
                ),
            )
            DEBUG and Log.note("{{sql}}", sql=text(command))
            job = self.container.query_and_wait(command)
            DEBUG and Log.note(
                "job {{id}} state = {{state}}", id=job.job_id, state=job.state
            )
            if job.errors:
                Log.error(
                    "\n{{sql}}\nDid not fill table:\n{{reason|json|indent}}",
                    sql=command.sql,
                    reason=job.errors,
                )
            # SOURCE SHARDS ARE DROPPED ONLY AFTER THE CHUNK SUCCEEDS
            for _, shard, _ in merge_chunk:
                self.container.client.delete_table(shard)

    # ALL OTHER SCHEMAS MISMATCH
    for s, shard, _ in unmatched:
        try:
            command = ConcatSQL(SQL_INSERT, quote_column(primary_full_name), s)
            DEBUG and Log.note("{{sql}}", sql=text(command))
            job = self.container.query_and_wait(command)
            DEBUG and Log.note(
                "from {{shard}}, job {{id}}, state {{state}}",
                id=job.job_id,
                shard=shard.table_id,
                state=job.state,
            )
            if job.errors:
                # AN EMPTY (SCHEMA-LESS) SHARD IS NOT AN ERROR; ANYTHING
                # ELSE ABORTS BEFORE THE delete_table BELOW
                if all(
                    " does not have a schema." in m
                    for m in wrap(job.errors).message
                ):
                    pass  # NOTHING TO DO
                else:
                    Log.error(
                        "\n{{sql}}\nDid not fill table:\n{{reason|json|indent}}",
                        sql=command.sql,
                        reason=job.errors,
                    )
            self.container.client.delete_table(shard)
        except Exception as e:
            Log.warning("failure to merge {{shard}}", shard=shard, cause=e)

    # REMOVE OLD VIEW
    view_full_name = self.container.full_name + api_name
    if current_view:
        self.container.client.delete_table(current_view)

    # CREATE NEW VIEW
    self.container.create_view(view_full_name, primary_full_name)
def __init__(
    self,
    table,
    typed,
    read_only,
    sharded,
    container,
    id=Null,
    partition=Null,
    cluster=Null,
    top_level_fields=Null,
    kwargs=None,
):
    """
    Attach to an existing BigQuery table in the given container.

    For an unsharded table the alias must be a real table (unless read_only).
    For a sharded table the alias must be a view; its SQL names the primary
    shard, which is fetched here. When that shard is missing or broken, a
    fresh shard is created and the view is re-pointed at it.
    """
    self.short_name = table
    self.typed = typed
    self.read_only = read_only
    self.cluster = cluster
    self.id = id
    self.top_level_fields = top_level_fields
    # KEEP ENOUGH CONFIG TO REPLICATE THIS TABLE ELSEWHERE
    self.config = Data(
        typed=typed,
        read_only=read_only,
        sharded=sharded,
        id=id,
        partition=partition,
        cluster=cluster,
        top_level_fields=top_level_fields,
    )
    self.full_name = container.full_name + escape_name(table)
    alias_view = container.client.get_table(text(self.full_name))
    self.alias_view = alias_view
    self.partition = partition
    self.container = container

    if sharded:
        # SHARDED TABLES ARE REACHED THROUGH A VIEW OVER THE PRIMARY SHARD
        if alias_view.table_type != "VIEW":
            Log.error("Sharded tables require a view")
        view_table = container.client.get_table(text(self.full_name))
        primary_shard = _extract_primary_shard_name(view_table.view_query)
        try:
            self.shard = container.client.get_table(
                text(container.full_name + primary_shard)
            )
            self._flake = Snowflake.parse(
                alias_view.schema,
                text(self.full_name),
                self.top_level_fields,
                partition,
            )
        except Exception as cause:
            # PRIMARY SHARD IS GONE OR UNREADABLE: BUILD A NEW ONE AND
            # RE-POINT THE VIEW AT IT
            Log.warning("view {{name}} is invalid", name=primary_shard, cause=cause)
            self._flake = Snowflake.parse(
                alias_view.schema,
                text(self.full_name),
                self.top_level_fields,
                partition,
            )
            # REMOVE STALE VIEW
            container.client.delete_table(view_table)
            # MAKE NEW VIEW POINTING TO NEW SHARD
            self._create_new_shard()
            container.create_view(
                self.full_name,
                self.container.full_name + ApiName(self.shard.table_id),
            )
    else:
        # UNSHARDED: THE ALIAS ITSELF IS THE ONLY SHARD
        if not read_only and alias_view.table_type == "VIEW":
            Log.error("Expecting a table, not a view")
        self.shard = alias_view
        self._flake = Snowflake.parse(
            alias_view.schema,
            text(self.full_name),
            self.top_level_fields,
            partition,
        )

    # EXTEND (INSERT) MACHINERY
    self.last_extend = Date.now() - EXTEND_LIMIT
    self.extend_locker = Lock()
    self.extend_queue = Queue("wait for extend")