def delete_data_in_ch(self):
    client = Client(**CLICKHOUSE_DB_INFO)
    delete_data_last = {}
    if not self.delete_data_list:
        self.time_cost_last_delete = 0
    else:
        logger.info("begin group data with primary_key_values to each table")
        for i in self.delete_data_list:
            delete_data_last.setdefault(i['table_name'],
                                        []).append(i['primary_key_values'])
        logger.info("group each table data for one sql")
        for table_name, values in delete_data_last.items():
            sql_cur = {}
            for v in values:
                for k, vv in v.items():
                    sql_cur.setdefault(k, []).append(vv)
            # render "col in (v1, v2, ...)" by hand: tuple() rendering would
            # leave a trailing comma for single values -- e.g. "in ('a',)" --
            # which ClickHouse rejects
            last_delete = ' and '.join(
                '%s in (%s)' % (k, ', '.join(repr(x) for x in v))
                for k, v in sql_cur.items())
            exec_sql = "alter table %s delete where %s ;" % (table_name,
                                                             last_delete)
            begin_delete_time = int(time.time())
            logger.info("begin execute: exec_sql:%s" % exec_sql)
            client.execute(exec_sql)
            logger.info("end execute")
        client.disconnect()
        # only the most recent statement is timed, matching the attribute name
        self.time_cost_last_delete = int(time.time()) - begin_delete_time
def push(self, client: Client = None, key: str = None, close: bool = False):
    """Return a client to the pool for reuse.

    Args:
        client: The client to return.
        key: If known, the key of the client.
        close: Close the client instead of adding it back to the pool.
    """
    self._lock.acquire()
    try:
        if self.closed:
            raise ChPoolError("pool closed")
        if key is None:
            key = self._rused.get(id(client))
        if key is None:
            raise ChPoolError("trying to put unkeyed client")
        if len(self._pool) < self.connections_min and not close:
            # TODO: verify connection still valid
            if client.connection.connected:
                self._pool.append(client)
        else:
            client.disconnect()
        # ensure a thread doesn't put the connection back once the pool
        # is closed
        if not self.closed or key in self._used:
            del self._used[key]
            del self._rused[id(client)]
    finally:
        self._lock.release()
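# A hedged usage sketch for the pool method above: it assumes the class also
# exposes a pull() that checks a keyed client out of the pool, which is not
# shown in this snippet, and the ChPool constructor arguments are hypothetical.
pool = ChPool(host='localhost', connections_min=4)
client = pool.pull()
try:
    client.execute('SELECT 1')
finally:
    # always hand the client back (or close it) so the key bookkeeping in
    # _used/_rused stays consistent
    pool.push(client=client)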
class DBConnection:
    def __init__(self, clickhouse_host=None, clickhouse_name=None,
                 dbuser=None, dbpass=None, db_query=None):
        self.db_query = db_query
        self.clickhouse_client = Client(
            host=clickhouse_host,
            database=clickhouse_name,
            user=dbuser,
            password=dbpass
        )
        try_conn = self._connection_test()
        if try_conn:
            # _connection_test returns an error message string; wrap it so
            # raise gets an exception instance (raising a bare string is a
            # TypeError in Python 3)
            raise ConnectionError(try_conn)
        self.connection_status = False

    def make_connection(self):
        try:
            self.clickhouse_client.connection.connect()
        except Exception as error:
            logger.error(error)
            return False
        self.connection_status = True
        return True

    def _connection_test(self):
        try:
            self.clickhouse_client.connection.connect()
        except errors.Error as error:
            logger.error(exception_to_logger(error))
            return "Error of database connection."
        self.clickhouse_client.disconnect()
        return None

    def send_request(self, trace=False):
        response_status = False
        try:
            if isinstance(self.db_query, tuple):
                response = self.clickhouse_client.execute(
                    self.db_query[0], self.db_query[1]
                )
            else:
                response = self.clickhouse_client.execute(self.db_query)
            response_status = True
        except Exception as error:
            logger.error(exception_to_logger(error))
            response = json_str_error("Error in sql query!")
            if trace:
                print_statusbar([(error, 'd',)])
        return response_status, response

    def disconnect(self):
        self.clickhouse_client.disconnect()
        self.connection_status = False
def insert_to_ch(schema, table, res, start_time=None, end_time=None):
    if not res:
        logger.info(f"None Result, pass: {schema}.{table} {start_time}, {end_time}")
        return
    ind = random.randint(0, len(CH_SERVERS) - 1)
    host, port = CH_SERVERS[ind][0], CH_SERVERS[ind][1]
    if end_time:
        # remove the exact '_local' suffix; str.rstrip('_local') would strip
        # any trailing '_', 'l', 'o', 'c', 'a' characters instead
        distributed_table = (table[:-len('_local')]
                             if table.endswith('_local') else table)
        client = Client(host=host, port=port, user=CH_USER, password=CH_PASS)
        s = client.execute(f"select count(*) cnt from "
                           f"{schema}.{distributed_table} "
                           f"where end_time=toDateTime('{end_time}')")
        s = s[0][0]
        client.disconnect()
    else:
        s = 0
    if not s:
        sql = f'INSERT INTO {schema}.{table}(' \
              f'end_time, invertal, gtid, transaction_spend_time, ' \
              f'transaction_size, single_statement_affected_rows) VALUES'
        client = Client(host=host, port=port, user=CH_USER, password=CH_PASS)
        client.execute(sql, res)
        logger.info(f"Success: {schema}.{table} {start_time}, {end_time}")
        client.disconnect()
    else:
        logger.info(f'Exists, pass: {schema}.{table} {start_time}, {end_time}')
def clickhouse():
    client = Client(
        settings.CLICKHOUSE_HOST,
        user=settings.CLICKHOUSE_USER,
        password=settings.CLICKHOUSE_PASSWORD,
    )
    client.execute(f'DROP DATABASE IF EXISTS {settings.CLICKHOUSE_NAME}')
    client.execute(f'CREATE DATABASE {settings.CLICKHOUSE_NAME}')
    client.disconnect()
def insert_partition(rows):
    assert isinstance(rows, list), rows
    assert isinstance(rows[0], tuple), rows[0]
    assert len(rows[0]) == 2, rows[0]
    client = Client('localhost')
    try:
        client.execute('INSERT INTO hashes (data, hash) VALUES', rows)
    finally:
        # disconnect whether or not the INSERT succeeded; disconnecting only
        # in an except branch would leak the connection on success and
        # silently swallow errors
        client.disconnect()
class ClientWrapper:
    """Context manager to allow use of ClickHouse connections in a with clause"""

    def __init__(self, *args, **kwargs):
        self.client = Client(*args, **kwargs)

    def __enter__(self):
        return self.client

    def __exit__(self, *args):
        """Disconnect from the server to free up the socket"""
        self.client.disconnect()
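# Example use of ClientWrapper: __exit__ disconnects even if the query raises.
# The connection parameters are placeholders.
with ClientWrapper('localhost', user='default', password='') as client:
    rows = client.execute('SELECT version()')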
def check_clickhouse_status(self):
    sql = "show databases"
    try:
        client = Client(**CLICKHOUSE_DB_INFO)
        client.execute(sql)
        client.disconnect()
        messages = "clickhouse status is ok"
        logger.info(messages)
        return True
    except Exception as error:
        messages = "cannot connect to clickhouse: %s" % (str(error))
        logger.error(messages)
        return False
def get_ch_column_type(self, db, table):
    """
    :returns e.g.: {'id': 'UInt32', 'report_id': 'Int32',
        'cost_type': 'Int16', 'cost_name': 'String',
        'amount': 'Int32', 'actual_amount': 'Int32'}
    """
    sql = "select name,type from system.columns where database='{0}' and table='{1}'".format(
        db, table)
    try:
        client = Client(**CLICKHOUSE_DB_INFO)
        ret = client.execute(sql)
        client.disconnect()
        return {d[0]: d[1] for d in ret}
    except Exception as error:
        messages = "failed to get column types from clickhouse. %s" % (str(error))
        logger.error(messages)
        alarm(alarm_cnf, title=self.alarm_title, messages=messages)
        exit(1)
def test_insert_block_size(self):
    client = Client(self.host, self.port, self.database, self.user,
                    self.password, settings={'insert_block_size': 1})

    with self.create_table('a UInt8'):
        data = [(x, ) for x in range(4)]
        client.execute('INSERT INTO test (a) VALUES', data)

        query = 'SELECT * FROM test'
        inserted = self.emit_cli(query)
        self.assertEqual(inserted, '0\n1\n2\n3\n')

        inserted = client.execute(query)
        self.assertEqual(inserted, data)

    client.disconnect()
def dbms_sync_clickhouse_schema(row):
    ignored_schemas = ('_temporary_and_external_tables', 'system', 'default')
    query = f"select name from system.databases where name not in {ignored_schemas}"
    config = {
        'host': row.host,
        'port': row.port,
        'database': 'default',
        'connect_timeout': 5,
        'send_receive_timeout': 5,
    }
    # the user must already have been created in clickhouse
    config.update(REOMOTE_USER)
    cnx = Client(**config)
    result = cnx.execute(query)
    for i in result:
        schema = i[0]
        models.DbSchemas.objects.update_or_create(cid_id=row.id,
                                                  schema=schema,
                                                  defaults={'schema': schema})
    cnx.disconnect()
def check_table_exists_on_clickhouse(self):
    """
    Check whether the MySQL tables to be synced exist in ClickHouse.
    :return:
    """
    client = Client(**CLICKHOUSE_DB_INFO)
    not_in_clickhouse_tables = []
    if not self.rsnyc_tables:
        messages = "no tables to sync"
        logger.error(messages)
        return False
    else:
        try:
            for i in self.rsnyc_tables:
                table_schema = i['table_schema']
                table_name = i['table_name']
                check_sql = """select count(*) countnum from system.tables
                    where database= '%s' and name='%s';""" % (
                    table_schema, table_name)
                countnum = client.execute(check_sql)
                if countnum[0][0] == 0:
                    not_in_clickhouse_tables.append(table_schema + '.' + table_name)
        except Exception as error:
            messages = "clickhouse failed: %s" % (str(error))
            logger.error(messages)
        finally:
            if client:
                client.disconnect()
        if not_in_clickhouse_tables:
            messages = "tables: %s not in clickhouse" % (
                ','.join(not_in_clickhouse_tables))
            logger.error(messages)
            return False
        else:
            messages = "tables all in clickhouse"
            logger.info(messages)
            return True
def get_most_time_consuming_transaction(start_datetime, end_datetime, prefix):
    start = start_datetime.strftime('%Y-%m-%d %H:%M:%S')
    end = end_datetime.strftime('%Y-%m-%d %H:%M:%S')
    # `interval` is not a parameter; it is assumed to come from module scope
    sql = f'''
        SELECT
            toDateTime('{end}'),
            '{str(interval)}',
            gtid,
            max(execute_time) - min(execute_time) AS transaction_spend_time,
            (max(toUInt32(binlog_pos)) - min(toUInt32(binlog_pos)))
                + argMax(toUInt32(single_statement_size), toUInt32(binlog_pos)) AS transaction_size,
            sum(single_statement_affected_rows) AS single_statement_affected_rows
        FROM mysql_monitor.{prefix}_binlog
        WHERE (execute_time >= '{start}') AND (execute_time < '{end}')
        GROUP BY gtid
        ORDER BY transaction_spend_time DESC
        LIMIT 1
    '''
    ind = random.randint(0, len(CH_SERVERS) - 1)
    host, port = CH_SERVERS[ind][0], CH_SERVERS[ind][1]
    client = Client(host=host, port=port, user=CH_USER, password=CH_PASS,
                    settings=SETTINGS)
    res = client.execute(sql)
    client.disconnect()
    return res
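# The query above returns rows in exactly the column order that
# insert_to_ch() earlier in this file expects, so the two helpers can be
# chained. A sketch -- the 'db1' prefix and the 'db1_binlog_stat_local'
# target table name are hypothetical:
from datetime import datetime, timedelta

end_dt = datetime.now()
start_dt = end_dt - timedelta(minutes=5)
res = get_most_time_consuming_transaction(start_dt, end_dt, 'db1')
insert_to_ch('mysql_monitor', 'db1_binlog_stat_local', res,
             start_time=start_dt.strftime('%Y-%m-%d %H:%M:%S'),
             end_time=end_dt.strftime('%Y-%m-%d %H:%M:%S'))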
class ClickhouseClient:
    """
    ClickHouse wrapper
    """

    def __init__(self, host: str, port: int, database: str, user: str,
                 password: str) -> None:
        self.conn = Client(host=host, port=port, database=database,
                           user=user, password=password)

    def close(self):
        self.conn.disconnect()

    def query(self, query: str, params: Any = None) -> List[str]:
        """
        Run a query and return its rows.
        :param query: query statement
        :param params: query parameters
        :return: list of JSON strings, one per row, keyed by column name
        """
        try:
            data = self.conn.execute_iter(query, params,
                                          with_column_types=True)
            # with_column_types=True makes the first yielded item the list
            # of (name, type) pairs
            columns = [column[0] for column in next(data)]
            temp = []
            for row in data:
                temp.append(
                    json.dumps(dict(zip(columns, [value for value in row])),
                               cls=DateEncoder))
        except Exception as e:
            raise e
        finally:
            self.close()
        return temp
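# Usage sketch for ClickhouseClient above. Note that query() closes the
# connection in its finally block, so one instance serves one call.
# The connection parameters are placeholders.
ch = ClickhouseClient(host='localhost', port=9000, database='default',
                      user='default', password='')
for row_json in ch.query('SELECT name, engine FROM system.tables LIMIT 5'):
    print(row_json)  # each element is a JSON string keyed by column name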
def insert_data(data: list):
    client = Client('localhost')
    client.execute("INSERT INTO test.php_local (col1, col2, col3) VALUES",
                   data)
    client.disconnect()
class GeneralSocialDemoRatingAdmixer(generics.ListAPIView, ParamsHandler):
    renderer_classes = (r.BrowsableAPIRenderer, r.JSONRenderer,
                        AdmixerRenderer)
    serializer_class = SocialDemoRatingAdmixerSerializer
    permission_classes = (permissions.IsAuthenticatedOrReadOnly,
                          IsRequestsToSocialDemoAllow)
    pagination_class = ConfiguredPageNumberPagination
    admixer_values_list = ('platform', 'browser', 'region', 'age', 'gender',
                           'income', 'uniques', 'views')

    def _chunks(self, l, n):
        return [l[i:i + n] for i in range(0, len(l), n)]

    def _convert(self, tup):
        di = {}
        for a, b in tup:
            di.setdefault(a, []).append(b)
        return di

    def _query_admixer_data(self, batch_ids, start_date, end_date):
        ids = ",".join("'%s'" % item for item in batch_ids)
        # note the trailing space after Sum(Views): without it the
        # concatenated query runs into the FROM clause
        query = 'select UrlId, Platform, Browser, Country, Age, Gender, Income, count(distinct IntVisKey), Sum(Views) ' \
                'from admixer.UrlStat ' \
                'where UrlId in (%s) and Date >= \'%s\' and Date <= \'%s\' ' \
                'Group by UrlId, Platform, Browser, Country, Age, Gender, Income' % (ids, start_date, end_date)
        response = self._client.execute(query)
        results = dict(
            zip(self.admixer_values_list, [[], [], [], [], [], [], 0, 0]))
        for row in response:
            item = dict(zip(self.admixer_values_list, row[1:]))
            for key in self.admixer_values_list[:-2]:
                results[key].append(item[key])
            for key in self.admixer_values_list[len(self.admixer_values_list) - 2:]:
                results[key] += (item[key])
        # log the number of returned rows, not the size of the fixed-key dict
        logger.info("Received %d records from ClickHouse", len(response))
        return results

    def list(self, request, *args, **kwargs):
        params = self.handle_request_params(request)
        aggregator = params.pop("aggregator", None)
        if not aggregator:
            return Response()
        try:
            end_date = params.pop("posted_date__lte")
            start_date = params.pop("posted_date__gte")
        except KeyError:
            end_date = datetime.strptime(settings.DEFAULT_TO_DATE, "%Y-%m-%d")
            start_date = datetime.strptime(settings.DEFAULT_FROM_DATE,
                                           "%Y-%m-%d")
        publications = Publication.objects.filter(**params).values_list(
            aggregator, "shukachpublication__shukach_id")
        l_part = self._convert(publications)
        self._client = Client(settings.CLICKHOUSE_HOST,
                              database=settings.CLICKHOUSE_DB,
                              user=settings.CLICKHOUSE_USER,
                              password=settings.CLICKHOUSE_PASSWORD)
        total = len(publications)
        current = 0
        queryset = []
        for agg, ids in l_part.items():
            results = dict(
                zip(self.admixer_values_list, [[], [], [], [], [], [], 0, 0]))
            for batch_ids in self._chunks(ids, 10000):
                logger.info("Sent %d ids" % len(batch_ids))
                item = self._query_admixer_data(batch_ids, start_date,
                                                end_date)
                for key in self.admixer_values_list:
                    results[key] += item[key]
                current += len(batch_ids)
                logger.info("Processed: %d/%d" % (current, total))
            queryset.append({
                "aggregator": agg,
                "views": results["views"],
                "platforms": dict(Counter(results["platform"])),
                "browsers": dict(Counter(results["browser"])),
                "regions": dict(Counter(results["region"])),
                "age_groups": dict(Counter(results["age"])),
                "gender_groups": dict(Counter(results["gender"])),
                "income_groups": dict(Counter(results["income"])),
                "uniques": results["uniques"]
            })
        self._client.disconnect()
        page = self.paginate_queryset(queryset)
        if page is not None:
            return self.get_paginated_response(queryset)
        return Response(queryset)
# Fragment of a consumer loop; jsonrow, counter, write_to_db, mod_factor,
# clk_settings and insert_clause come from the enclosing code.
        if write_to_db == 0:
            print(jsonrow)
            if counter == 1:
                kafka_df = pd.DataFrame({'Message': jsonrow},
                                        index=np.arange(1))
            else:
                # DataFrame.append returns a new frame, so reassign
                kafka_df = kafka_df.append(
                    pd.DataFrame({'Message': jsonrow}, index=np.arange(1)))
        if (counter % mod_factor) == 0:
            print('Printing Dataframe')
            print(kafka_df)
            if write_to_db == 0:
                sys.exit()
            if write_to_db == 1:
                try:
                    clk_haus_client = Client('localhost',
                                             settings=clk_settings,
                                             connect_timeout=60,
                                             send_receive_timeout=900,
                                             sync_request_timeout=120)
                    # 'records' is the full spelling of the 'r' orient
                    clk_haus_client.execute(insert_clause,
                                            kafka_df.to_dict('records'))
                    clk_haus_client.disconnect()
                except Exception as e:
                    print(str(e))
            kafka_array = []
            kafka_df = pd.DataFrame()
            counter = 0
        counter = counter + 1
    except Exception as e:
        print("Outer Exception")
        print(str(e))
        sys.exit()
except KeyboardInterrupt:
    sys.exit()
class ClickHouse(object):
    """
    ClickHouse is a facade pattern class based on the clickhouse-driver
    python API for the ClickHouse DBMS.

    It defines useful higher-level commands and adds to this API
    tracing/debug functionality and improved output format with Pandas
    dataframes.
    """

    def __init__(self, host, port, user, password, database, trace=0):
        self._client = 'ClickHouse'
        self._host = host
        self._port = port
        self._user = user
        self._password = password
        self._database = database
        self._api = None
        self._trace = trace
        self._last_query = None

        # clickhouse-driver last query execution statistics variables
        (self._lastquery_id, self._resultset_rows, self._elapsed,
         self._processed_rows, self._processed_bytes,
         self._total_rows) = [None, None, None, None, None, None]

        if host == 'localhost':
            self._host = '127.0.0.1'

        # Create connection with clickhouse driver API
        try:
            self._api = Client(host=host, database=database, port=port,
                               user=user, password=password)
            self._api.execute('SHOW TABLES')
        except Exception:
            raise DBConnectionFailed(
                'Connection to ClickHouse failed. Check connection parameters'
            )

    @property
    def last_query(self):
        return self._last_query

    @property
    def last_query_stats(self):
        return [
            self._lastquery_id, self._resultset_rows, self._elapsed,
            self._processed_rows, self._processed_bytes, self._total_rows
        ]

    def sql(self, sql='', cols=None, index=None, split=True, auto=False,
            params=None, columnar=False, qid=None, execute=True):
        """
        This method calls the clickhouse-driver execute() method to run an
        SQL query; the connection has already been established.

        :param sql: ClickHouse SQL query string that will be sent to the server
        :param cols: pandas dataframe columns
        :param index: pandas dataframe index columns
        :param split: either split the columns string argument or leave it
        :param auto: if True, try to extract the columns from
                     SELECT ... FROM and pass them to `cols`
        :param params: clickhouse-driver execute() parameters
        :param columnar: if specified the result will be returned in
                         column-oriented form; defaults to row-like form
        :param qid: query identifier; if no query id is specified the
                    ClickHouse server will generate it
        :param execute: execute SQL commands only if execute=True
        :return: pandas dataframe
        """
        # Initialization stage
        tuples = ()
        self._last_query = sql
        self._lastquery_id = qid
        (self._elapsed, self._resultset_rows, self._processed_rows,
         self._processed_bytes, self._total_rows) = [0, 0, 0, 0, 0]

        # clickhouse-driver execution of sql statement
        # ToDO: 1. choose the output format, i.e. display tuples, pandas
        #          dataframe, dictionary, etc...
        # ToDO: 2. paging with a generator,
        #          e.g. gen = (row for row in cql.execute('SELECT * from FloatOnSSD_SRC'))
        #          and clickhouse-driver streaming results, i.e. the
        #          execute_iter() command
        if execute:
            tuples = self._api.execute(query=sql, params=params,
                                       columnar=columnar, query_id=qid)
            self._elapsed = self._api.last_query.elapsed
            # Avoid AttributeError: 'NoneType' object has no attribute
            # 'rows' in clickhouse-driver
            try:
                self._resultset_rows = self._api.last_query.profile_info.rows
                self._processed_rows = self._api.last_query.progress.rows
                self._processed_bytes = self._api.last_query.progress.bytes
                self._total_rows = self._api.last_query.progress.total_rows
            except AttributeError:
                pass

        # Transform tuples to pandas dataframe.
        # Start measuring elapsed time for transforming python tuples to a
        # pandas dataframe
        t_start = time.perf_counter()
        if auto:
            pos1 = sql.find('SELECT') + 7
            pos2 = sql.find('FROM', pos1)
            cols = sql[pos1:pos2 - 1]
        if cols and split:
            cols = cols.split(', ')
        if index and split:
            index = index.split(', ')
        try:
            result = ETL.get_dataframe(tuples, cols, index)
        except Exception:
            print(sql)
            raise PandasError(
                'Failed to construct Pandas dataframe, check query and parameters'
            )
        # End measuring elapsed time for pandas dataframe transformation
        t_end = time.perf_counter()

        # Debug info
        if self._trace > 2:
            print(f'QueryID:{qid}\n'
                  f'Latency for pandas dataframe transformation : '
                  f'{round(t_end - t_start, 3)} sec.')
        if self._trace > 1:
            print(f'{self._last_query}\n'
                  '╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌')
        if self._trace > 0:
            lqs = self.last_query_stats
            print(f'QueryID:{lqs[0]}\nElapsed: {round(lqs[2], 3)} sec',
                  f'{lqs[1]} rows in set.\nProcessed: {lqs[3]} rows, '
                  f'{round(lqs[4] / 1048576, 3)} MB',
                  '\n_________________________________________________________')
        if result.empty:
            if self._trace > 0:
                print('Done.\n'
                      '╚═══════════════════════════════════════════════════╝')
            return None
        else:
            return result

    def cmd(self, cmd, dbhost=None, dbport=None, dbuser=None,
            dbpassword=None, db=None, table=None, engine=None, partkey=None,
            skey=None, settings=None, aggr=False, group_by=None,
            heading=None, fields=None, projection='*', where=None, hb2=None,
            source=None, ha2=None, fullpath=None, sql=None, active=True,
            limit=None, execute=True):
        """
        Basically this is a wrapper method that constructs SQL statements;
        the `sql` method executes these statements.

        :param cmd: command controls the type of result to return; default
                    is to return the result set from the query
        :param dbhost: mysql host
        :param dbport: mysql port
        :param dbuser: mysql database user
        :param dbpassword: mysql database user password
        :param db: database name
        :param table: table name
        :param engine: the type of clickhouse engine
        :param partkey: partition key
        :param skey: sorting key
        :param settings: clickhouse engine settings
        :param aggr: aggregate results
        :param group_by: SQL GROUP BY construct
        :param heading: list of field names paired with clickhouse data
                        types ( 'fld1_name dtype1', 'fld2_name dtype2'... )
        :param fields: list of field names, used in pandas dataframe and in
                       the insert command
        :param projection: list of columns from a table, used in the select
                           command
        :param where: SQL WHERE construct in the select command
        :param source: clickhouse file format, mysql, odbc, etc...
        :param fullpath: full path of the flat file used to read data from
                         (relative to ClickHouse user_files_path)
        :param sql: SQL query
        :param active: select only active parts
        :param hb2: select parts with a specific hb2 dimension (hb2 is the
                    dim2 of the Entity/ASET key); default hb2='%'
        :param ha2: attribute dimension that is used in the insert command
                    (ImportedDataResourceWithRightJoin)
        :param limit: SQL LIMIT
        :param execute: execute the command only if execute=True

        Each one of the following commands takes specific parameters;
        `execute` is common for all of them:

        `tables`    : db, engine, table, group_by
        `columns`   : db, table, aggr
        `parts`     : db, table, hb2, active
        `optimize`  : table
        `mutations` : table, limit
        `query_log` :
        `create`    : table, heading, engine, partkey, skey, settings
        `select`    : dbhost, dbport, dbuser, dbpassword, db, table, source,
                      fullpath, heading, fields, where, projection, limit
        `insert`    : table, fields, source, ha2

        :return: query result set in a pandas dataframe
        """
        # ------------------------------------------------------------------
        # check cmd argument
        # ------------------------------------------------------------------
        if cmd not in cmd_types:
            raise InvalidCmdOperation(
                f'Invalid command operation: failed with parameter cmd={cmd}')

        # ------------------------------------------------------------------
        # parsing cmd argument
        # ------------------------------------------------------------------
        if cmd == 'parts' and table is not None and db is not None:
            # return information about parts of MergeTree tables
            if active is None:
                activepart = None
            elif active:
                activepart = 1
            else:
                activepart = 0
            sel = '\nSELECT table, database, partition_id, name, active, marks, rows,'
            sel += '\n min_block_number as min_blk, max_block_number AS max_blk,'
            sel += '\n level, toDecimal32(primary_key_bytes_in_memory/1024, 3) AS pk_mem'
            frm = 'FROM system.parts'
            if hb2 is None:
                wh = f"WHERE database='{db}' AND table = '{table}' "
            else:
                wh = f"WHERE database='{db}' AND table = '{table}' AND partition_id LIKE '{hb2}-%' "
            if active is not None:
                wh += f' AND active = {activepart}'
            ordname = 'ORDER BY name'
            # construct query
            query = sql_construct(select=sel, frm=frm, where=wh,
                                  order=ordname)
            # execute SQL query
            cols = 'table, db, PID, name, active, marks, rows, min_blk, max_blk, level, pk_mem (KB)'
            return self.sql(query, cols, index='PID', qid='Parts Command',
                            execute=execute)

        elif cmd == 'mutations' and table is not None:
            # mutations allow changing or deleting lots of rows in a table;
            # return information about mutations of MergeTree tables and
            # their progress
            sel = '\nSELECT table, command, create_time, block_numbers.number AS blk, parts_to_do AS parts, is_done,'
            sel += '\n latest_failed_part AS failed_at, latest_fail_time AS failed_time'
            frm = 'FROM system.mutations'
            wh = f"WHERE table = '{table}'"
            ordtime = 'ORDER BY create_time DESC'
            lim = f'LIMIT {limit}' if limit else None
            grp = f'GROUP BY {group_by}' if group_by else None
            # construct query
            query = sql_construct(select=sel, frm=frm, where=wh,
                                  group_by=grp, order=ordtime, limit=lim)
            # execute SQL query
            cols = 'table, command, created_at, blk, parts, is_done, failed_at, failed_time'
            self.sql('system flush logs', qid='flush logs', execute=execute)
            return self.sql(query, cols, qid='Mutation Information Command',
                            execute=execute)

        elif cmd == 'optimize' and table is not None:
            # construct query
            query = f'OPTIMIZE TABLE {table} FINAL'
            # execute SQL query
            return self.sql(query, qid='Optimize Engine Command',
                            execute=execute)

        elif cmd == 'query_log':
            # metadata for queries logged in the ClickHouse table with the
            # log_queries=1 setting
            sel = 'SELECT query_id AS id, user, client_hostname as host, client_name as client,'
            sel += '\n result_rows AS in_set, toDecimal32(query_duration_ms / 1000, 3) AS sec,'
            sel += '\n toDecimal32(memory_usage/1048576, 3) AS MEM_MB,'
            sel += '\n read_rows as R_Rows, toDecimal32(read_bytes / 1048576, 3) AS R_MB,'
            sel += '\n written_rows AS W_Rows, toDecimal32(written_bytes/1048576, 3) AS W_MB, query'
            frm = 'FROM system.query_log'
            wh = "WHERE (type = 2) AND (query NOT LIKE '%query_duration_ms%')"
            ordtime = 'ORDER BY event_time DESC'
            # construct query
            query = sql_construct(select=sel, frm=frm, where=wh,
                                  order=ordtime)
            # execute SQL query
            cols = 'id, user, host, client, in_set, sec, MEM_MB, R_Rows, R_MB, W_Rows, W_MB, query'
            self.sql('system flush logs', qid='flush logs command',
                     execute=execute)
            return self.sql(query, cols, qid='Query Log Command', index='id',
                            execute=execute)

        elif cmd == 'tables':
            # contains metadata of each table that the server knows about;
            # detached tables are not shown
            sel = 'SELECT database as db, engine, name as table, '
            sel += '\n partition_key as partkey, sorting_key as skey, primary_key as pkey'
            frm = 'FROM system.tables'
            wh = f"WHERE db='{db}' "
            if table:
                wh += f"AND table like '%{table}%'"
            if engine:
                wh += f"AND engine='{engine}'"
            ordengine = 'ORDER BY db, engine'
            # construct query
            query = sql_construct(select=sel, frm=frm, where=wh,
                                  order=ordengine)
            if group_by:
                sel = f'SELECT {group_by}, groupArray(table) AS tables'
                frm = f'FROM ( \n{query} )'
                grp = f'GROUP BY {group_by}'
                query = sql_construct(select=sel, frm=frm, group_by=grp)
                cols = 'engine, tables'
            else:
                cols = 'db, engine, table, partkey, skey, pkey'
            # execute SQL query
            return self.sql(query, cols,
                            qid='Table Engines Metadata Command',
                            execute=execute)

        elif cmd == 'columns':
            # information about the columns in a table
            sel = 'SELECT name, comment, type,'
            sel += '\n toDecimal32(data_compressed_bytes/1048576, 3) as Compressed_MB,'
            sel += '\n toDecimal32(data_uncompressed_bytes/1048576, 3) as Uncompressed_MB,'
            sel += '\n toDecimal32(marks_bytes/1024, 3) as marks_KB'
            frm = 'FROM system.columns'
            wh = f"WHERE database='{db}' AND table='{table}'"
            ordtype = 'ORDER BY type DESC, Compressed_MB DESC'
            # construct query
            query = sql_construct(select=sel, frm=frm, where=wh,
                                  order=ordtype)
            if aggr:
                sel = f"SELECT any('{db}') as db, any('{table}') as table,"
                sel += '\n sum(Compressed_MB) as total_compressed_MB,'
                sel += '\n sum(Uncompressed_MB) as total_uncompressed_MB, sum(marks_KB) as total_marks_KB,'
                sel += '\n argMin(name, Compressed_MB) as min_column, min(Compressed_MB) as min_MB,'
                sel += '\n argMax(name, Compressed_MB) as max_column, max(Compressed_MB) as max_MB,'
                sel += '\n avg(Compressed_MB) as avg_MB'
                frm = f'FROM ( \n{query} )'
                query = sql_construct(select=sel, frm=frm)
                cols = 'db, table, total_compressed_MB, total_uncompressed_MB, total_marks_KB, '
                cols += 'min_column, min_MB, max_column, max_MB, avg_MB'
            else:
                cols = 'name, comment, type, Compressed_MB, Uncompressed_MB, marks_KB'
            # execute SQL query
            return self.sql(query, cols,
                            qid='Table Columns Metadata Command',
                            execute=execute)

        elif cmd == 'create':
            # check the engine passed
            if engine not in engine_types:
                raise InvalidEngine(
                    f'Invalid TRIADB engine: failed with parameter engine={engine}'
                )
            # join list of strings to a ', \n' separated string
            structure = ', \n'.join(heading)
            query = f'CREATE TABLE {table}'
            query += f'({structure})'
            query += f'\nENGINE = {engine}()'
            query += f'\nPARTITION BY {partkey}'
            query += f'\nORDER BY {skey}'
            query += f'\nSETTINGS {settings}'
            # execute SQL query
            self.sql(f'DROP TABLE IF EXISTS {table}',
                     qid=f'Drop Table {table}', execute=execute)
            return self.sql(query, qid=f'Create Engine {engine} Command',
                            execute=execute)

        elif cmd == 'select':
            if source not in source_types:
                raise InvalidSourceType(
                    f'Invalid TRIADB source type: failed with parameter source={source}'
                )
            query = None
            result = None
            if source in ['CSVWithNames', 'TabSeparatedWithNames', 'MySQL']:
                sel = f'SELECT {projection}'
                frm = ''
                if source in ['CSVWithNames', 'TabSeparatedWithNames']:
                    # join list of strings to a ', ' separated string
                    structure = ', '.join(heading)
                    frm = 'FROM file('
                    frm += f"\n '{fullpath}',"
                    frm += f"\n '{source}',"
                    frm += f"\n '{structure}'"
                    frm += '\n)'
                elif source == 'MySQL':
                    frm = f"FROM mysql('{dbhost}:{dbport}', '{db}', '{table}', '{dbuser}', '{dbpassword}')"
                wh = f'WHERE {where}' if where else None
                lim = f'LIMIT {limit}' if limit else None
                # construct query
                query = sql_construct(select=sel, frm=frm, where=wh,
                                      limit=lim)
                # execute SQL query
                result = self.sql(query,
                                  qid='Select rows from flat file Command',
                                  cols=fields, split=False, execute=execute)
            elif source == 'ImportedDataResource':
                structure = ', '.join(heading)
                sel = f'SELECT {structure}'
                frm = f'FROM {table} '
                grp = 'GROUP BY val'
                hav = 'HAVING isNotNull(val)'
                ordval = 'ORDER BY val'
                # construct query
                query = sql_construct(select=sel, frm=frm, group_by=grp,
                                      having=hav, order=ordval)
                # execute SQL query
                result = self.sql(
                    query, qid='Select HyperAtom AdjacencyLists Command',
                    cols=fields, split=False, execute=execute)
            # if the query is used in other commands pass execute=False to
            # return it instead
            if execute:
                return result
            else:
                return query

        elif cmd == 'insert':
            if source not in source_types:
                raise InvalidSourceType(
                    f'Invalid TRIADB source type: failed with parameter source={source}'
                )
            sqlid = None
            frm = None
            # inserting the results of SELECT
            ins = f'INSERT INTO {table}'
            sel = 'SELECT ' + ', '.join(fields)
            if source == 'file':
                # from part
                frm = f'FROM (\n{sql} \n)'
                # query id: insert from file to import data resource
                sqlid = 'InsertFromFile'
            elif source == 'ImportedDataResourceWithRightJoin':
                # from part
                frm = f'FROM {table} \nRIGHT JOIN(\n{sql} \n) AS A USING val\n WHERE ha2={ha2}'
                # query id: insert from imported data resource to load data
                # onto data type dictionary engines
                sqlid = 'InsertFromImportedDataResourceWithRightJoin'
            elif source == 'ImportedDataResource':
                # from part
                frm = f'FROM (\n{sql} \n)'
                # query id: insert from imported data resource to load data
                # onto data type dictionary engines
                sqlid = 'InsertFromImportedDataResource'
            elif source == 'TableEngine':
                # in this case the sql parameter takes the name of the
                # table engine
                frm = f'FROM {sql}'
                sqlid = 'InsertFromTableEngine'
            elif source == 'DataTypeDictionary':
                # from part (sql here is the name of the DataTypeDictionary
                # table)
                frm = f'FROM {sql}'
                # insert from DataTypeDictionary to load data onto HAtom
                # and HLink
                sqlid = 'InsertFromDataTypeDictionary'
            # construct query
            query = ins + f'\n{sel}\n{frm}'
            return self.sql(query, qid=f'{sqlid} Command', execute=execute)

        else:
            raise InvalidCmdOperation('Invalid command operation')

    def disconnect(self):
        self._api.disconnect()
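# A short usage sketch for the facade above, hedged: it assumes a reachable
# server and that ETL.get_dataframe() is importable; the connection values
# are placeholders.
ch = ClickHouse(host='127.0.0.1', port=9000, user='default',
                password='', database='default', trace=1)
# free-form SQL returned as a pandas dataframe
df = ch.sql('SELECT name, engine FROM system.tables', cols='name, engine')
# higher-level command: table metadata for one database
tables = ch.cmd('tables', db='default')
ch.disconnect()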
def create_db(db_name, db_host, db_user, db_password):
    client = Client(db_host, user=db_user, password=db_password)
    client.execute(f"CREATE DATABASE IF NOT EXISTS {db_name}")
    client.disconnect()
print(f'Extracting the selected rows of table {left_arc_file_name}')

#Move the cursor through the compressed table to the start of each row
#matching the query. Each new cursor position is counted not from the
#start of the file but from the last remembered position, which in a
#number of cases yields enormous performance gains. Write the selected
#rows to the target file and set the flag that marks the presence of
#non-header rows in the target file.
cur_pointer = 0
for line_start in res:
    new_pointer = line_start[0]
    left_arc_file_opened.seek(new_pointer - cur_pointer, 1)
    trg_file_opened.write(
        left_arc_file_opened.readline().decode('UTF-8'))
    cur_pointer = left_arc_file_opened.tell()
    empty_res = False

#If the indicator flag is still True, the intersection/subtraction
#produced no results for this left table and only headers made it into
#the target file. Such target files are deleted.
if empty_res == True:
    os.remove(trg_file_path)

client.disconnect()
def create_database():
    '''
    Creates a ClickHouse database and fills each of its tables with
    information providing fast access to the rows of the corresponding
    compressed source table.
    '''
    ind_dir_path = os.path.normpath(
        input('\nPath to the folder with the archives to index: '))
    trg_dir_path = input('\nPath to the folder for the results: ')

    #For simplicity, the database name is almost the same as the name of
    #the folder with the files to index.
    #Connect to ClickHouse, create the client object.
    db_name = f'DBCH{os.path.basename(ind_dir_path)}'
    client = Client('localhost')

    #Check for a database created from the same data during previous runs
    #of the program. If a previous DB is found, print the names of the
    #tables and columns stored there, as well as the data types.
    if (f'{db_name}', ) in client.execute('SHOW DATABASES'):
        print(f'\nDatabase {db_name} already exists')
        client.execute(f'USE {db_name}')
        tab_names = [
            tup[0] for tup in client.execute('SHOW TABLES')
            if tup[0] != 'header'
        ]
        table_struc = client.execute_iter(f'DESCRIBE TABLE {tab_names[0]}')
        col_names_n_types = {
            tup[0]: tup[1]
            for tup in table_struc if tup[0] != 'line_start'
        }
        print('\nTables of the previously created database:\n', tab_names)
        print(
            '\nTable columns and corresponding data types of the previously created DB:\n',
            col_names_n_types)

        #Since a DB matching the tables of the chosen folder was created
        #earlier, it can be used by a frontend right away. The other
        #option is to create the database anew, e.g. to re-index the
        #tables by different columns.
        recreate = input('''\nRecreate the database?
[yes(|y)|no(|n|<enter>)]: ''')
        if recreate in ['yes', 'y']:
            client.execute(f'DROP DATABASE {db_name}')
        elif recreate in ['no', 'n', '']:
            return ind_dir_path, trg_dir_path, db_name, tab_names, col_names_n_types
        else:
            print(f'{recreate} - invalid option')
            sys.exit()

    ram = int(input('\nAmount of RAM in this computer, GB: ')) * 1e9
    detect_headers = input('''\nDetect the VCF (##) and UCSC (track_name=)
headers of the indexed tables automatically, or will you specify the number
of header lines yourself?
(A previewer for large files is available in the repository
https://github.com/PlatonB/bioinformatic-python-scripts)
[auto(|a)|manual(|m)]: ''')
    if detect_headers in ['auto', 'a']:
        num_of_unind = None
    elif detect_headers in ['manual', 'm']:
        num_of_unind = input('''\nNumber of unprocessed lines at the start
of each indexed table
(Important! Do not count the tab-separated header line)
(skip input ==> process all lines)
[0(|<enter>)|1|2|...]: ''')
        if num_of_unind == '':
            num_of_unind = 0
        else:
            num_of_unind = int(num_of_unind)
    else:
        print(f'{detect_headers} - invalid option')
        sys.exit()

    cont, col_info = 'y', {}
    while cont not in ['no', 'n', '']:
        col_name = input('''\nName of the column to index
(case matters)
[#Chrom|pos|RSID|...]: ''')
        col_name = ''.join(col_name.split('#'))
        data_type = input('''\nDoes the chosen column hold integers,
decimals or strings?
(decimal examples: 0.05, 2.5e-12)
(string examples: X, Y, A/C/G, rs11624464, HLA-DQB1)
[integer(|i)|decimal(|d)|string(|s)]: ''')
        if data_type in ['integer', 'i']:
            data_type = 'Int64'
            tale = None
        elif data_type in ['decimal', 'd']:
            tale = input('''\nHow many digits to keep after the decimal point?
(skip input ==> 5)
[...|5(|<enter>)|...|18): ''')
            if tale == '':
                tale = '5'
            #values outside the 0..18 range are invalid for Decimal64
            elif not 0 <= int(tale) <= 18:
                print(f'{tale} - invalid option')
                sys.exit()
            data_type = f'Decimal64({tale})'
        elif data_type in ['string', 's']:
            data_type = 'String'
            tale = None
        else:
            print(f'{data_type} - invalid option')
            sys.exit()
        col_info[col_name] = [data_type, 'cell_index', tale]
        cont = input('''\nIndex by one more column?
(skip input ==> no)
[yes(|y)|no(|n|<enter>)]: ''')
        if cont not in ['yes', 'y', 'no', 'n', '']:
            print(f'{cont} - invalid option')
            sys.exit()

    #Complete the dict of user-chosen column names and characteristics,
    #built during the dialog above, with a key-value pair describing the
    #column of indices of the archived table.
    col_info['line_start'] = ['Int64']

    #Get the names of the user-specified columns and of the column with
    #the indices of the compressed table.
    col_names = list(col_info.keys())

    #ClickHouse does not index columns, it merely sorts them. Give
    #sorting half of the RAM. If that amount is not enough, ClickHouse
    #falls back to external sorting: placing chunks of a column on disk,
    #sorting each of them and merging them step by step.
    client.execute(f'SET max_bytes_before_external_sort = {int(ram) // 2}')

    #Create the DB and select it as the default for all further queries.
    client.execute(f'CREATE DATABASE {db_name}')
    client.execute(f'USE {db_name}')
    print('')

    #Process the archives, each of which contains a single table.
    arc_file_names = os.listdir(ind_dir_path)
    for arc_file_name in arc_file_names:
        if arc_file_name.startswith('.~lock.'):
            continue
        with gzip.open(os.path.join(ind_dir_path, arc_file_name),
                       mode='rt') as arc_file_opened:

            #Automatically detect and skip the headers of the common
            #bioinformatics formats VCF and UCSC BED. The last header
            #read (i.e. the table heading) is kept.
            if num_of_unind == None:
                while True:
                    header_row = process_line(arc_file_opened)
                    if re.match(r'##|track_name=', header_row[0]) == None:
                        break
            #Skip the number of headers specified by the user and keep
            #the table heading.
            else:
                for unind_index in range(num_of_unind):
                    arc_file_opened.readline()
                header_row = process_line(arc_file_opened)

            #A hard requirement of the program is uniformity of the
            #source tables, proven by identical headings. The heading of
            #the first processed table becomes the reference; headings
            #of the following tables are compared against it.
            if 'common_header_row' not in locals():
                common_header_row = copy.deepcopy(header_row)
            elif header_row != common_header_row:
                print('The headings of the indexed tables do not match')
                sys.exit()

            #The heading cells that title the user-chosen columns will
            #later become the names of the DB columns. Since those names
            #must not contain the # character (a ClickHouse requirement),
            #remove it.
            for header_cell_index in range(len(header_row)):
                if header_row[header_cell_index].find('#') != -1:
                    header_row[header_cell_index] = ''.join(
                        header_row[header_cell_index].split('#'))

            #During the user dialog a dict was built with the user-chosen
            #names of the future DB columns and the corresponding
            #ClickHouse-supported data types. To every key of the dict,
            #except the one for the row-start column, add the index of
            #that column name, i.e. its position in the heading.
            #These same indices will determine the position of the
            #corresponding cells in each row of the source table.
            for col_name in col_names[:-1]:
                col_info[col_name][1] = header_row.index(col_name)

            #For simplicity, the DB tables get the same names as the
            #source tables, but without dots and hyphens, since those
            #are not allowed in ClickHouse table names. Table names must
            #not start with a digit either, so prepend a letter prefix
            #to each name.
            tab_name = 'TBL' + arc_file_name.replace('.', 'DOT').replace(
                '-', 'DEFIS')

            #Create the DB table which, once filled, will serve as a
            #guide to the corresponding gzip-archived large table. The
            #name and data type of each DB column come from the dict
            #built earlier. By default ClickHouse compresses each column
            #with LZ4, which is very fast but barely reduces the size.
            #Apply Zstandard instead, optimal in both speed and
            #compression ratio.
            client.execute(f'''CREATE TABLE {tab_name}
                            ({", ".join([col_name + " " + col_ann[0] + " CODEC(ZSTD(22))" for col_name, col_ann in col_info.items()])})
                            ENGINE = MergeTree()
                            ORDER BY ({", ".join(col_names[:-1])})''')

            print(f'Table {tab_name} of the new database is being filled')

            #The data will arrive in the database in one or more chunks.
            #To control chunk handling, the chunk size is tracked; zero
            #is its starting value.
            fragment, fragment_len = [], 0

            #The DB table is filled until the rows of the source table
            #run out.
            while True:

                #A chunk size of 100000 rows follows the recommendation
                #of the official ClickHouse documentation.
                if fragment_len == 100000:
                    client.execute(
                        f'''INSERT INTO {tab_name}
                        ({", ".join(col_names)})
                        VALUES''', fragment)

                    #After a chunk is added, the list holding it is
                    #cleared and its size counter is reset to zero.
                    fragment.clear()
                    fragment_len = 0

                #Get the byte position of the start of the current row
                #of the source table. Strip the \n and split the row
                #into a list.
                line_start, row = arc_file_opened.tell(), process_line(
                    arc_file_opened)

                #Reading of the source table is finished. Most likely
                #the number of its rows is not a multiple of 100000, so
                #some data has accumulated by this point. Write it to
                #the database.
                if row == ['']:
                    if fragment_len > 0:
                        client.execute(
                            f'''INSERT INTO {tab_name}
                            ({", ".join(col_names)})
                            VALUES''', fragment)
                    break

                #Select the cells of those columns of the compressed
                #table that are being indexed. Store these cells and the
                #start positions of the table rows in a list.
                cells = fetch_cells(row, col_info, line_start)

                #Start a new chunk (e.g. after the previous one was sent
                #to the DB) or keep building the current one.
                fragment.append(dict(zip(col_names, cells)))

                #Either way, increment the chunk size counter.
                fragment_len += 1

    #Collect information about the structure of the database; it will be
    #used later by the frontends. Also print it, so that it is easy for
    #the user to find their way around the base when running the
    #frontends.
    tab_names = [tup[0] for tup in client.execute('SHOW TABLES')]
    col_names_n_types = {
        col_name: col_ann[0]
        for col_name, col_ann in col_info.items() if col_name != 'line_start'
    }
    print('\nTables of the new database:\n', tab_names)
    print('\nTable columns and corresponding data types of the new DB:\n',
          col_names_n_types)

    #The heading common to all source tables goes into a separate DB
    #table.
    client.execute('''CREATE TABLE header
                    (header_cells String)
                    ENGINE = TinyLog''')
    client.execute(
        'INSERT INTO header (header_cells) VALUES',
        [{'header_cells': header_cell} for header_cell in common_header_row])
    client.disconnect()
    return ind_dir_path, trg_dir_path, db_name, tab_names, col_names_n_types
class ClickHouse:

    def __init__(self):
        self.client = None

    def __enter__(self):
        self.client = Client(
            settings.CLICKHOUSE_HOST,
            user=settings.CLICKHOUSE_USER,
            password=settings.CLICKHOUSE_PASSWORD,
            database=settings.CLICKHOUSE_NAME,
        )
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.client.disconnect()

    def execute(self, query, *args, **kwargs):
        echo = kwargs.pop('echo', None)
        if echo is not None:
            print(query)
        return self.client.execute(query, *args, **kwargs)

    def insert_rows(self, table, fields, rows):
        n = len(rows)
        if n > 0:
            fields = list(fields)
            values = []
            for row in rows:
                values.append(tuple(row[name] for name in fields))
            columns = ', '.join(f'"{name}"' for name in fields)
            self.execute(
                f'INSERT INTO {table} ({columns}) VALUES',
                values,
                types_check=True,
                #echo=True,
            )
        return n

    def upload(self, name, metadata, fields, rows):
        rows2 = []
        for time, data in rows:
            data = data.copy()
            data['TIMESTAMP'] = int(time.timestamp())
            rows2.append(data)
        rows = rows2

        # Guess the table name
        table_name = metadata.get('table_name')
        if table_name:
            table = f"{name}_{table_name}"
        else:
            table = str(name)

        # Create the table if it does not exist.
        # The Replacing engine allows avoiding duplicates. Deduplication is
        # done in the background, so there may be duplicates until the parts
        # are merged.
        self.execute(
            f"CREATE TABLE IF NOT EXISTS {table} ({get_column('TIMESTAMP')}) "
            f"ENGINE = ReplacingMergeTree() ORDER BY TIMESTAMP",
            #echo=True,
        )

        # Get the table columns
        database = settings.CLICKHOUSE_NAME
        cols = set([name for name, in self.execute(
            f"SELECT name FROM system.columns "
            f"WHERE database = '{database}' AND table = '{table}';",
            #echo=True,
        )])

        # Add new columns
        fields = set(fields)
        new = fields - cols
        if new:
            actions = ', '.join(f'ADD COLUMN {get_column(name)}'
                                for name in new)
            self.execute(f'ALTER TABLE {table} {actions};')

        self.insert_rows(table, fields, rows)
        return metadata

    def select(self, table, columns=None, where=None, group_by=None,
               order_by=None, limit=None, limit_by=None, **kwargs):
        if not columns:
            columns = '*'
        else:
            assert type(columns) is list
            columns = ', '.join(columns)

        query = [f'SELECT {columns} FROM {table}']
        clauses = [
            ('WHERE {}', where),
            ('GROUP BY {}', group_by),
            ('ORDER BY {}', order_by),
            ('LIMIT {0[0]} BY {0[1]}', limit_by),
            ('LIMIT {}', limit),
        ]
        for fstr, value in clauses:
            if value not in (None, ''):
                query.append(fstr.format(value))
        query = ' '.join(query)

        return self.execute(query, **kwargs)
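# Usage sketch for the context manager above; the settings.* values are
# assumed to point at a reachable server. Disconnection happens in __exit__.
with ClickHouse() as ch:
    rows = ch.select('system.tables', columns=['name', 'engine'],
                     where="database = 'default'", limit=10)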