def create_table_sql(self, db):
    """
    Returns the engine part of the CREATE TABLE statement for this engine.

    - `db`: the Database instance the table will be created on; its
      `server_version` decides which partitioning syntax is emitted.

    Raises `DatabaseException` when a custom partition key is requested but
    the server predates ClickHouse 1.1.54310.
    """
    name = self.__class__.__name__
    if self.replica_name:
        name = 'Replicated' + name

    # In ClickHouse 1.1.54310 custom partitioning key was introduced
    # https://clickhouse.yandex/docs/en/table_engines/custom_partitioning_key/
    # Let's check version and use new syntax if available
    if db.server_version >= (1, 1, 54310):
        partition_sql = "PARTITION BY %s ORDER BY %s" \
                        % ('(%s)' % comma_join(self.partition_key),
                           '(%s)' % comma_join(self.order_by))

        if self.sampling_expr:
            partition_sql += " SAMPLE BY %s" % self.sampling_expr

        partition_sql += " SETTINGS index_granularity=%d" % self.index_granularity
    elif not self.date_col:
        # Can't import it globally due to circular import
        from infi.clickhouse_orm.database import DatabaseException
        # BUGFIX: the original message concatenated "syntax." directly onto the
        # URL with no separator, producing "…date_col syntax.https://…".
        raise DatabaseException("Custom partitioning is not supported before ClickHouse 1.1.54310. "
                                "Please update your server or use date_col syntax. "
                                "https://clickhouse.yandex/docs/en/table_engines/custom_partitioning_key/")
    else:
        # Old servers with the legacy date_col syntax: no partition clause here.
        partition_sql = ''

    params = self._build_sql_params(db)
    return '%s(%s) %s' % (name, comma_join(params), partition_sql)
def _init(self, db_name, db_url='http://localhost:8123/',
          username=None, password=None,
          readonly=False, autocreate=True,
          timeout=60, verify_ssl_cert=True, ssl_cert=None,
          log_statements=False):
    '''
    Initializes a database instance. Unless it's readonly, the database will
    be created on the ClickHouse server if it does not already exist.

    - `db_name`: name of the database to connect to.
    - `db_url`: URL of the ClickHouse server.
    - `username`: optional connection credentials.
    - `password`: optional connection credentials.
    - `readonly`: use a read-only connection.
    - `autocreate`: automatically create the database if it does not exist
      (unless in readonly mode).
    - `timeout`: the connection timeout in seconds.
    - `verify_ssl_cert`: whether to verify the server's certificate when
      connecting via HTTPS.
    - `ssl_cert`: certificate and key when connecting via HTTPS.
    - `log_statements`: when True, all database statements are logged.

    Raises `DatabaseException` if `readonly` is set and the database does
    not exist.
    '''
    self.username = username
    self.password = password
    self.db_name = db_name
    self.db_url = db_url
    # Deliberately start in read-write mode even when `readonly` was
    # requested; the flag is only switched on further down, after the
    # existence check has succeeded.
    self.readonly = False
    self.timeout = timeout
    # A single Session is reused for all requests (keeps the connection
    # pool and the SSL/auth configuration in one place).
    self.request_session = requests.Session()
    self.request_session.verify = verify_ssl_cert
    self.request_session.cert = ssl_cert
    if username:
        self.request_session.auth = (username, password or '')
    self.log_statements = log_statements
    self.settings = {}
    # Set a safe default before probing the server, since the probe below
    # issues a statement through this (still half-initialized) instance.
    self.db_exists = False
    self.db_exists = self._is_existing_database()
    if readonly:
        # Read-only connections never create the database.
        if not self.db_exists:
            raise DatabaseException(
                'Database does not exist, and cannot be created under readonly connection'
            )
        self.connection_readonly = self._is_connection_readonly()
        self.readonly = True
    elif autocreate and not self.db_exists:
        self.create_database()
    # Feature flags derived from the server version, checked once at
    # connection time so per-query code can test plain booleans.
    self.server_version = self._get_server_version()
    # Versions 1.1.53981 and below don't have timezone function
    self.server_timezone = (self._get_server_timezone()
                            if self.server_version > (1, 1, 53981)
                            else pytz.utc)
    # Versions 19.1.16 and above support codec compression
    self.has_codec_support = self.server_version >= (19, 1, 16)
    # Version 19.0 and above support LowCardinality
    self.has_low_cardinality_support = self.server_version >= (19, 0)
async def insert_async(self, model_instances, batch_size=1000):
    '''
    Insert records into the database.

    - `model_instances`: any iterable containing instances of a single model class.
    - `batch_size`: number of records to send per chunk (use a lower number
      if your records are very large).

    Raises `DatabaseException` when the model is read-only or a system model.
    '''
    from six import next
    i = iter(model_instances)
    try:
        first_instance = next(i)
    except StopIteration:
        return  # model_instances is empty
    first_instance.set_database(self)
    model_class = first_instance.__class__

    if first_instance.is_read_only() or first_instance.is_system_model():
        raise DatabaseException(
            "You can't insert into read only and system tables")

    fields_list = ','.join(
        ['`%s`' % name for name in first_instance.fields(writable=True)])

    def gen():
        # Yields lists of row dicts, each list holding up to batch_size rows.
        values = [first_instance.to_dict(include_readonly=False)]
        # BUGFIX: was `lines = 2` after appending a single record, which made
        # the first batch one record short of batch_size.
        lines = 1
        for instance in i:
            instance.set_database(self)
            values.append(instance.to_dict(include_readonly=False))
            lines += 1
            if lines >= batch_size:
                # Return the current batch of rows
                yield values
                # Start a new batch
                values = []
                lines = 0
        # Return any remaining rows in a partial batch
        if lines:
            yield values

    # The query is batch-independent, so build it once outside the loop.
    query = self._substitute(
        'INSERT INTO $table (%s) VALUES ' % fields_list, model_class)
    for batch in gen():
        # NOTE(review): the row data travels through the `settings` kwarg of
        # _send — looks unusual; confirm _send's signature really accepts the
        # payload this way.
        await self._send(query, settings=batch)
def insert_tuples(self, model_class: Type['ClickHouseModel'], model_tuples: Iterable[tuple],
                  batch_size: Optional[int] = None, formatted: bool = False) -> None:
    """
    Inserts an iterable of namedtuples into model_class's table using the
    TabSeparated format, optionally chunked into batches.

    :param model_class: ClickHouse model, namedtuples are made from
    :param model_tuples: An iterable of tuples to insert
    :param batch_size: Maximum number of rows per INSERT request; when None,
        everything is sent in a single request
    :param formatted: If flag is set, tuples are expected to be ready to
        insert without calling field.to_db_string
    :raises DatabaseException: when inserting into a read-only or system table
    :return: None
    """
    tuples_iterator = iter(model_tuples)
    try:
        first_tuple = next(tuples_iterator)
    except StopIteration:
        return  # model_instances is empty

    if model_class.is_read_only() or model_class.is_system_model():
        raise DatabaseException(
            "You can't insert into read only and system tables")

    # The column list comes from the namedtuple's fields (not the full model),
    # so the tuples may cover a subset of the writable columns.
    fields_list = ','.join('`%s`' % name for name in first_tuple._fields)
    fields_dict = model_class.fields(writable=True)
    statsd_key = "%s.inserted_tuples.%s" % (config.STATSD_PREFIX, model_class.__name__)

    # Pre-encode the INSERT header once; every batch payload starts with it.
    query = 'INSERT INTO `%s`.`%s` (%s) FORMAT TabSeparated\n' \
            % (self.db_name, model_class.table_name(), fields_list)
    query_enc = query.encode('utf-8')

    def tuple_to_csv(tup):
        # Render one namedtuple as a single TabSeparated line. When
        # `formatted` is set the values are used verbatim; otherwise each
        # field serializes its own value (unquoted) via to_db_string.
        if formatted:
            str_gen = (getattr(tup, field_name) for field_name in first_tuple._fields)
        else:
            str_gen = (fields_dict[field_name].to_db_string(getattr(
                tup, field_name), quote=False) for field_name in first_tuple._fields)

        return '%s\n' % '\t'.join(str_gen)

    def gen():
        # Yields complete request payloads (header + up to batch_size rows)
        # as bytes; statsd counts rows at the moment each payload is emitted.
        buf = BytesIO()
        buf.write(query_enc)
        buf.write(tuple_to_csv(first_tuple).encode('utf-8'))

        # Collect lines in batches of batch_size
        lines = 1
        for t in tuples_iterator:
            buf.write(tuple_to_csv(t).encode('utf-8'))
            lines += 1
            if batch_size is not None and lines >= batch_size:
                # Return the current batch of lines
                statsd.incr(statsd_key, lines)
                yield buf.getvalue()
                # Start a new batch
                buf = BytesIO()
                buf.write(query_enc)
                lines = 0

        # Return any remaining lines in partial batch
        if lines:
            statsd.incr(statsd_key, lines)
            yield buf.getvalue()

    # gen() is kept as a separate generator (rather than inlined)
    # for testing purposes
    for data in gen():
        with statsd.timer(statsd_key):
            logger.debug('django-clickhouse: insert tuple: %s' % data)
            self._send(data)