def __init__(
    self,
    host,
    index,
    port=9200,
    type="log",
    queue_size=1000,
    batch_size=100,
    kwargs=None,
):
    """
    settings ARE FOR THE ELASTICSEARCH INDEX
    """
    kwargs.timeout = Duration(coalesce(kwargs.timeout, "30second")).seconds
    kwargs.retry.times = coalesce(kwargs.retry.times, 3)
    kwargs.retry.sleep = Duration(coalesce(kwargs.retry.sleep, MINUTE)).seconds
    kwargs.host = Random.sample(listwrap(host), 1)[0]

    schema = json2value(value2json(SCHEMA), leaves=True)
    schema.mappings[type].properties["~N~"].type = "nested"

    self.es = Cluster(kwargs).get_or_create_index(
        schema=schema,
        limit_replicas=True,
        typed=True,
        kwargs=kwargs,
    )
    self.batch_size = batch_size
    self.es.add_alias(coalesce(kwargs.alias, kwargs.index))
    self.queue = Queue("debug logs to es", max=queue_size, silent=True)
    self.worker = Thread.run("add debug logs to es", self._insert_loop)
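
# A minimal usage sketch for the constructor above, assuming the enclosing class
# is the ElasticSearch structured-log handler and that settings arrive as a
# mo-dots container (so attribute access like kwargs.retry.times works).
# The class name and the wrap() call are assumptions; the parameter names match
# the signature above.
#
#     settings = wrap({
#         "host": "http://localhost",
#         "port": 9200,
#         "index": "debug-logs",
#         "type": "log",
#         "queue_size": 1000,
#         "batch_size": 100,
#     })
#     log_sink = StructuredLoggerUsingElasticSearch(kwargs=settings)  # hypothetical class name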
def _create_new_shard(self):
    primary_shard = self.container.create_table(
        table=self.short_name + "_" + "".join(Random.sample(ALLOWED, 20)),
        sharded=False,
        schema=self._flake.schema,
        kwargs=self.config,
    )
    self.shard = primary_shard.shard
def __init__(
    self,
    host,
    index,
    port=9200,
    type="log",
    queue_size=1000,
    batch_size=100,
    kwargs=None,
):
    """
    settings ARE FOR THE ELASTICSEARCH INDEX
    """
    kwargs.timeout = Duration(coalesce(kwargs.timeout, "30second")).seconds
    kwargs.retry.times = coalesce(kwargs.retry.times, 3)
    kwargs.retry.sleep = Duration(coalesce(kwargs.retry.sleep, MINUTE)).seconds
    kwargs.host = Random.sample(listwrap(host), 1)[0]

    rollover_interval = coalesce(kwargs.rollover.interval, kwargs.rollover.max, "year")
    rollover_max = coalesce(kwargs.rollover.max, kwargs.rollover.interval, "year")

    schema = set_default(
        kwargs.schema,
        {"mappings": {kwargs.type: {"properties": {"~N~": {"type": "nested"}}}}},
        json2value(value2json(SCHEMA), leaves=True),
    )

    self.es = RolloverIndex(
        rollover_field={"get": [{"first": "."}, {"literal": "timestamp"}]},
        rollover_interval=rollover_interval,
        rollover_max=rollover_max,
        schema=schema,
        limit_replicas=True,
        typed=True,
        read_only=False,
        kwargs=kwargs,
    )
    self.batch_size = batch_size
    self.queue = Queue("debug logs to es", max=queue_size, silent=True)
    self.worker = Thread.run("add debug logs to es", self._insert_loop)
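
# The rollover variant above reads optional rollover settings from kwargs and
# falls back to "year" when neither is given. A hedged sketch of the extra
# configuration it understands (the values are illustrative; only the key
# names appear in the code above):
#
#     settings.rollover = wrap({
#         "interval": "month",  # becomes rollover_interval: start a new index each month
#         "max": "year",        # becomes rollover_max: oldest data kept before rollover
#     })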
def test_id_vs_id(self):
    ops = [Op() for _ in range(200)]

    lang1 = {id(o): o for o in ops}
    sample = Random.sample(ops, 1000 * 1000)

    with Timer("using id()"):
        result1 = [lang1[id(o)] for o in sample]

    lang2 = [None] * (max(o.id for o in ops) + 1)
    for o in ops:
        lang2[o.id] = o
    # lang2 = tuple(lang2)

    with Timer("using o.id"):
        result2 = [lang2[o.id] for o in sample]
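
# The test above compares two lookup strategies over the same million-item
# sample: a dict keyed by CPython object identity id(o), versus a dense list
# indexed by a small sequential integer attribute o.id. The list lookup skips
# hashing entirely, which is the difference the two Timer blocks measure.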
def merge_shards(self):
    shards = []
    tables = list(self.container.client.list_tables(self.container.dataset))
    current_view = Null  # VIEW THAT POINTS TO PRIMARY SHARD
    primary_shard_name = None  # PRIMARY SHARD
    api_name = escape_name(self.short_name)

    for table_item in tables:
        table = table_item.reference
        table_api_name = ApiName(table.table_id)
        if text(table_api_name).startswith(text(api_name)):
            if table_api_name == api_name:
                if table_item.table_type != "VIEW":
                    Log.error("expecting {{table}} to be a view", table=api_name)
                current_view = self.container.client.get_table(table)
                view_sql = current_view.view_query
                primary_shard_name = _extract_primary_shard_name(view_sql)
            elif SUFFIX_PATTERN.match(text(table_api_name)[len(text(api_name)):]):
                try:
                    known_table = self.container.client.get_table(table)
                    shards.append(known_table)
                except Exception as e:
                    Log.warning("could not merge table {{table}}", table=table, cause=e)

    if not current_view:
        Log.error("expecting {{table}} to be a view pointing to a table", table=api_name)

    shard_flakes = [
        Snowflake.parse(
            big_query_schema=shard.schema,
            es_index=text(self.container.full_name + ApiName(shard.table_id)),
            top_level_fields=self.top_level_fields,
            partition=self.partition,
        )
        for shard in shards
    ]
    total_flake = snowflakes.merge(
        shard_flakes,
        es_index=text(self.full_name),
        top_level_fields=self.top_level_fields,
        partition=self.partition,
    )

    for i, s in enumerate(shards):
        if ApiName(s.table_id) == primary_shard_name:
            if total_flake == shard_flakes[i]:
                # USE THE CURRENT PRIMARY SHARD AS A DESTINATION
                del shards[i]
                del shard_flakes[i]
                break
    else:
        name = self.short_name + "_" + "".join(Random.sample(ALLOWED, 20))
        primary_shard_name = escape_name(name)
        self.container.create_table(
            table=name,
            schema=total_flake.schema,
            sharded=False,
            read_only=False,
            kwargs=self.config,
        )

    primary_full_name = self.container.full_name + primary_shard_name

    selects = []
    for flake, table in zip(shard_flakes, shards):
        q = ConcatSQL(
            SQL_SELECT,
            JoinSQL(ConcatSQL(SQL_COMMA, SQL_CR), gen_select(total_flake, flake)),
            SQL_FROM,
            quote_column(ApiName(table.dataset_id, table.table_id)),
        )
        selects.append(q)

    DEBUG and Log.note("inserting into table {{table}}", table=text(primary_shard_name))

    matched = []
    unmatched = []
    for sel, shard, flake in zip(selects, shards, shard_flakes):
        if flake == total_flake:
            matched.append((sel, shard, flake))
        else:
            unmatched.append((sel, shard, flake))

    # EVERYTHING THAT IS IDENTICAL TO PRIMARY CAN BE MERGED WITH SIMPLE UNION ALL
    if matched:
        for g, merge_chunk in jx.chunk(matched, MAX_MERGE):
            command = ConcatSQL(
                SQL_INSERT,
                quote_column(primary_full_name),
                JoinSQL(
                    SQL_UNION_ALL,
                    (
                        sql_query(
                            {"from": text(self.container.full_name + ApiName(shard.table_id))},
                            schema,
                        )
                        for _, shard, schema in merge_chunk
                    ),
                ),
            )
            DEBUG and Log.note("{{sql}}", sql=text(command))
            job = self.container.query_and_wait(command)
            DEBUG and Log.note("job {{id}} state = {{state}}", id=job.job_id, state=job.state)

            if job.errors:
                Log.error(
                    "\n{{sql}}\nDid not fill table:\n{{reason|json|indent}}",
                    sql=command.sql,
                    reason=job.errors,
                )

            for _, shard, _ in merge_chunk:
                self.container.client.delete_table(shard)

    # ALL OTHER SCHEMAS MISMATCH
    for s, shard, _ in unmatched:
        try:
            command = ConcatSQL(SQL_INSERT, quote_column(primary_full_name), s)
            DEBUG and Log.note("{{sql}}", sql=text(command))

            job = self.container.query_and_wait(command)
            DEBUG and Log.note(
                "from {{shard}}, job {{id}}, state {{state}}",
                id=job.job_id,
                shard=shard.table_id,
                state=job.state,
            )

            if job.errors:
                if all(" does not have a schema." in m for m in wrap(job.errors).message):
                    pass  # NOTHING TO DO
                else:
                    Log.error(
                        "\n{{sql}}\nDid not fill table:\n{{reason|json|indent}}",
                        sql=command.sql,
                        reason=job.errors,
                    )

            self.container.client.delete_table(shard)
        except Exception as e:
            Log.warning("failure to merge {{shard}}", shard=shard, cause=e)

    # REMOVE OLD VIEW
    view_full_name = self.container.full_name + api_name
    if current_view:
        self.container.client.delete_table(current_view)

    # CREATE NEW VIEW
    self.container.create_view(view_full_name, primary_full_name)
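
# For reference, a hedged sketch of the statement shape the "matched" branch of
# merge_shards() assembles. Table names here are illustrative; the real text is
# produced by quote_column() and sql_query() over each shard's schema:
#
#     INSERT INTO `dataset.my_table_primaryshard`
#     SELECT <columns> FROM `dataset.my_table_shard_aaaa`
#     UNION ALL
#     SELECT <columns> FROM `dataset.my_table_shard_bbbb`
#
# Shards whose schema differs from the merged schema take the slower per-shard
# path, where gen_select() maps each shard's columns onto the merged schema.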
def create_table(
    self,
    table,
    schema=None,
    typed=True,
    read_only=True,  # TO PREVENT ACCIDENTAL WRITING
    sharded=False,
    partition=Null,  # PARTITION RULES
    cluster=None,  # TUPLE OF FIELDS TO SORT DATA
    top_level_fields=Null,
    kwargs=None,
):
    if kwargs.lookup != None or kwargs.flake != None:
        Log.error("expecting schema, not lookup")
    full_name = self.full_name + escape_name(table)

    if not schema:
        # WE MUST HAVE SOMETHING
        if typed:
            schema = copy(DEFAULT_TYPED_SCHEMA)
        else:
            schema = copy(DEFAULT_SCHEMA)

    flake = Snowflake(text(full_name), top_level_fields, partition, schema=schema)

    if read_only:
        Log.error("Can not create a table for read-only use")

    if sharded:
        shard_name = escape_name(table + "_" + "".join(Random.sample(ALLOWED, 20)))
        shard_api_name = self.full_name + shard_name
        _shard = bigquery.Table(text(shard_api_name), schema=flake.to_bq_schema())
        _shard.time_partitioning = unwrap(flake._partition.bq_time_partitioning)
        _shard.clustering_fields = [
            c.es_column
            for f in listwrap(cluster)
            for c in [first(flake.leaves(f))]
            if c
        ] or None
        self.shard = self.client.create_table(_shard)
        self.create_view(full_name, shard_api_name)
    else:
        _table = bigquery.Table(text(full_name), schema=flake.to_bq_schema())
        _table.time_partitioning = unwrap(flake._partition.bq_time_partitioning)
        _table.clustering_fields = [
            l.es_column for f in listwrap(cluster) for l in flake.leaves(f)
        ] or None
        self.client.create_table(_table)
        DEBUG and Log.note("created table {{table}}", table=_table.table_id)

    return Table(
        table=table,
        typed=typed,
        read_only=read_only,
        sharded=sharded,
        partition=partition,
        top_level_fields=top_level_fields,
        kwargs=kwargs,
        container=self,
    )
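
# Hedged usage sketch for create_table() above. The container variable and
# field names are assumptions; the parameters match the signature, and
# read_only must be set to False explicitly since the default exists only to
# prevent accidental writes:
#
#     destination = container.create_table(
#         table="my_facts",
#         read_only=False,
#         sharded=True,              # also creates a view over the new shard
#         cluster=("etl.timestamp",),  # hypothetical clustering field
#         kwargs=settings,
#     )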