def _connect(self): self._Connection = None if (self.Connector == "clickhouse-driver") or ((self.Connector == "default") and (self.DBType == "ClickHouse")): try: import clickhouse_driver if self.DSN: self._Connection = clickhouse_driver.connect( dsn=self.DSN, password=self.Pwd) else: self._Connection = clickhouse_driver.connect( user=self.User, password=self.Pwd, host=self.IPAddr, port=self.Port, database=self.DBName) except Exception as e: Msg = ( "'%s' 尝试使用 clickhouse-driver 连接(%s@%s:%d)数据库 '%s' 失败: %s" % (self.Name, self.User, self.IPAddr, self.Port, self.DBName, str(e))) self._QS_Logger.error(Msg) if self.Connector != "default": raise e else: self._Connector = "clickhouse-driver" self._PID = os.getpid() return 0
def get_connection(self, db_name=None):
    """Return the cached connection, creating and caching it on first use.

    Args:
        db_name: optional database to select when the connection is first
            created; ignored when a connection is already cached.

    Returns:
        The (possibly newly created) DB-API connection.
    """
    if self.conn:
        return self.conn
    params = dict(host=self.host, port=self.port, user=self.user,
                  password=self.password, connect_timeout=10)
    if db_name:
        params["database"] = db_name
    self.conn = connect(**params)
    return self.conn
def get_connection(self, new_connect=False):
    """Return a ClickHouse connection built from ``self.CONFIG.CK``.

    With ``new_connect=False`` (default) a cached connection is reused while
    it is open and re-created (and re-cached) otherwise.  With
    ``new_connect=True`` a fresh, un-cached connection is always returned.
    """
    def _open():
        return connect(host=self.CONFIG.CK.host,
                       password=self.CONFIG.CK.password,
                       database=self.CONFIG.CK.database,
                       user=self.CONFIG.CK.user)

    if new_connect:
        return _open()
    if not (self.connection and not self.connection.is_closed):
        self.connection = _open()
    return self.connection
def create(self):
    """Create a new ClickHouse DB-API connection from ``self.config``.

    Raises:
        ImportError: when clickhouse_driver (>= 0.1.5) is unavailable.
    """
    try:
        import clickhouse_driver
        # escape_param only exists in >= 0.1.5, so this import doubles as a
        # minimum-version probe
        from clickhouse_driver.util.escape import escape_param
    except ImportError:
        raise ImportError("clickhouse_driver>=0.1.5 is required")
    return clickhouse_driver.connect(**self.config)
def database(): print("Phone number format: 375291234567") # check number format number = input("Input MSISDN here:") print("Data start format: 21-08-2020") # check data start data_start = input("Input Data start here:") print("Data finish format: 21-08-2020") data_finish = input("Input Data finish here:") con = connect(host=config.host, port=config.port, user=config.login, password=config.password) query = f"""SELECT RADIUS_ID, START_TIME, END_TIME, RULEBASE, BYTES_UPLINK, BYTES_DOWNLINK, SN_CHARGING_ACTION, P2P_PROTOCOL, SERVER_IP_ADDRESS, P2P_TLS_SNI, P2P_TLS_CNAME FROM DPI.FLOW_SHARDED PREWHERE RADIUS_ID = {number} WHERE END_TIME >= toDateTime('{data_start} 00:00:00') and END_TIME < toDateTime('{data_finish} 23:59:59')""" df = pd.read_sql_query(query, con) return df
def get_form(cls, connector: 'ClickhouseConnector', current_config):
    """
    Method to retrieve the form with a current config
    For example, once the connector is set,
    - we are able to give suggestions for the `database` field
    - if `database` is set, we are able to give suggestions for the `table` field
    """
    constraints = {}
    # Best effort: any connection or query error simply leaves `constraints`
    # empty and the plain schema is returned
    with suppress(Exception):
        connection = clickhouse_driver.connect(connector.get_connection_url())
        # Always add the suggestions for the available databases
        with connection.cursor() as cursor:
            cursor.execute('SHOW DATABASES')
            res = cursor.fetchall()
            # rows are 1-tuples; drop the internal 'system' database
            available_dbs = [db_name for (db_name, ) in res if db_name != 'system']
            constraints['database'] = strlist_to_enum('database', available_dbs)
            if 'database' in current_config:
                # NOTE(review): the database name is interpolated into SQL
                # unescaped — injection-prone if current_config is user input.
                cursor.execute(
                    f"""SELECT name FROM system.tables WHERE database = '{current_config["database"]}'"""
                )
                res = cursor.fetchall()
                available_tables = [table[0] for table in res]
                constraints['table'] = strlist_to_enum('table', available_tables, None)
    return create_model('FormSchema', **constraints, __base__=cls).schema()
def test_connect_default_params(self):
    """connect() with only a host must yield a usable cursor."""
    connection = connect(host=self.host)
    cursor = connection.cursor()
    # DB-API execute() returns no value
    self.assertIsNone(cursor.execute('SELECT 1'))
    self.assertEqual(cursor.fetchall(), [(1, )])
    connection.close()
def create_connection(self, **kwargs):
    """Open a connection, filling unspecified options from this instance.

    Any keyword argument explicitly passed by the caller wins over the
    instance defaults.
    """
    defaults = {
        'user': self.user,
        'password': self.password,
        'host': self.host,
        'port': self.port,
        'database': self.database,
    }
    for key, value in defaults.items():
        kwargs.setdefault(key, value)
    return connect(**kwargs)
def query(self, q):
    """Run *q* against ClickHouse and return (DataFrame, column map)."""
    conn_args = dict(host=self.host, port=self.port, database=self.database,
                     user=self.user, password=self.password)
    # context manager closes the connection even if read_sql raises
    with clickhouse_driver.connect(**conn_args) as con:
        frame = pd.read_sql(q, con=con)
    return frame, self._make_colmap(frame)
def get_group(slice,operationid,targetid,doc):
    """Split the users of tag `targetid` into sampled groups per `doc`.

    `doc` maps group name -> percentage (0-100); each value is divided by 100
    in place.  For every group an INSERT ... SELECT using groupArraySample()
    is executed on a shard chosen by randomInsert().  Returns
    (group_number, columns); group_number is appended as 0 for every group.

    NOTE(review): points to confirm with the author:
    - the zero-percent branch inserts into `add_plan_user_group` while the
      main branch uses `ads_plan_user_group` — likely a typo in one of them;
    - the chunking below hardcodes 4 instead of using `slice`;
    - the locals()['list'+i] / locals()['part'+i] pattern relies on mutating
      locals(), which has no defined effect inside a CPython function.
    """
    percent = []
    columns = []
    user_id = []  # unused in the visible code
    for k in doc:
        # convert percentage to fraction, mutating the caller's dict
        doc[k] = doc[k]/100
        columns.append(k)
        percent.append(doc[k])
    num = len(columns)
    conn = connect(host='chi-ftabc-clickhouse-0-0.default.svc.cluster.local', port='9000', database='ftabcch', user='******', password='******')
    cursor = conn.cursor()
    cursor.execute("select distinct id from ads_user_tag_distribute where tag_id={}".format(targetid))
    # total number of users carrying the target tag
    tag_id_num = len(cursor.fetchall())
    import random
    list_part = [i for i in range(slice)]
    random.shuffle(list_part)
    import math
    n = math.ceil(num/slice)
    many_list_part = list_part*n
    list_all = []
    for i in range(1,n+1):
        name = locals()
        # chunk of shard indices for round i (chunk width hardcoded to 4)
        name['list'+str(i)] = many_list_part[(i-1)*4:(i-1)*4+4]
        random.shuffle(name['list'+str(i)])
        for x in (name['list'+str(i)]):
            list_all.append(x)
    group_number = []
    name = locals()
    for i in range(num):
        if(percent[i]==0):
            # empty group: insert a placeholder [0] user array
            name = locals()
            name['part'+str(i)] = """insert into add_plan_user_group (operation_id, group_name, group_user) select {} as operation_id, '{}' as group_name, [0] as group_user""".format(operationid,columns[i])
            randomInsert(n,list_part,list_all,num,name['part'+str(i)])
            group_number.append(0)
        else:
            # sample ceil(tag_id_num * fraction) users of the tag that are not
            # already assigned to a group of this operation
            sql = """insert into ads_plan_user_group (operation_id, group_name, group_user) select {} as operation_id, '{}' as group_name, groupArraySample({})(sample) as group_user from (select bitmapToArray(bitmapAndnot((select bitmapBuild(groupArray(id)) from (select distinct id from ads_user_tag_distribute where tag_id = {})), (select groupBitmapOrState(gu) as bgu from (select bitmapBuild(group_user) gu from ads_plan_user_group_distribute where operation_id = {}) group by {}))) as uid) array join uid as sample; """.format(operationid,columns[i],math.ceil(tag_id_num*percent[i]),targetid,operationid,operationid)
            print(sql)
            name['part'+str(i)] = sql
            randomInsert(n,list_part,list_all,num,name['part'+str(i)])
            group_number.append(0)
    return group_number,columns
def idf(word_id):
    """BM25-style inverse document frequency for one word id.

    Reads the corpus size and the word's document count from ClickHouse and
    returns the smoothed idf normalized by the idf of an everywhere-present
    term.
    """
    connection = clickhouse_driver.connect("clickhouse://127.0.0.1")
    cursor = connection.cursor()
    cursor.execute("SELECT COUNT(*) FROM documents")
    if cursor.rowcount:
        n_total = cursor.fetchone()[0]
    else:
        n_total = 0
    cursor.execute("SELECT * FROM idf_doc_count WHERE word_id = %(id)s", {"id": word_id})
    if cursor.rowcount:
        n_word = cursor.fetchone()[1]
    else:
        n_word = 0
    connection.close()
    return log2((n_total - n_word + 0.5) / (n_word + 0.5)) - log2(0.5 / (n_total + 0.5))
def test_from_dsn(self):
    """A clickhouse:// DSN string must be accepted by connect()."""
    dsn = 'clickhouse://{user}:{password}@{host}:{port}/{database}'.format(
        user=self.user, password=self.password, host=self.host,
        port=self.port, database=self.database)
    connection = connect(dsn)
    cursor = connection.cursor()
    # DB-API execute() returns no value
    self.assertIsNone(cursor.execute('SELECT 1'))
    self.assertEqual(cursor.fetchall(), [(1, )])
    connection.close()
def insert_ck(self):
    """Flush the buffered redis-audit rows to ClickHouse.

    A no-op until the buffer reaches ``self.many`` rows; then the rows are
    bulk-inserted (best effort: failures are printed and the batch is
    dropped) and the buffer is reset.
    """
    if self.command_list_len >= self.many:
        url = 'clickhouse://{}'.format(self.ckhost)
        conn = None
        try:
            conn = connect(url)
            cursor = conn.cursor()
            cursor.executemany(
                'insert into redis_audit.redis_audit_info(source_host,source_port,destination_host,destination_port,command,cluster_name,event_date) values',
                self.command_list)
        except Exception:
            # was a bare "except:" — narrowed so KeyboardInterrupt/SystemExit
            # propagate; best-effort semantics otherwise unchanged
            print(traceback.format_exc())
        finally:
            # the original leaked the connection on every flush
            if conn is not None:
                conn.close()
        self.command_list_len = 0
        self.command_list = []
def randomInsert(n, list1, list2, number, sql):
    """Execute *sql* on the next pre-shuffled ClickHouse shard.

    Pops the next shard index from ``list1`` when ``number`` (the group
    count) is at most 4, otherwise from ``list2``; both lists are consumed in
    place by design.  ``n`` is accepted for interface compatibility but
    unused in the visible logic.
    """
    # single code path instead of the original two byte-identical branches;
    # also avoid shadowing the `random` module name with a local variable
    pool = list1 if number <= 4 else list2
    shard = pool.pop(0)
    conn = connect(host='chi-ftabc-clickhouse-{}-0.default.svc.cluster.local'.format(shard),
                   port='9000', database='ftabcch', user='******', password='******')
    cursor = conn.cursor()
    cursor.execute(sql)
    cursor.fetchall()
    time.sleep(1)
    # close the cursor before its connection (the original closed them in
    # reverse order)
    cursor.close()
    conn.close()
def connect_clickhouse(host=None, port=None, user=None, password=None, database=None, collection=None):
    """Open a ClickHouse connection and return a cursor for it.

    ``collection`` is accepted for interface compatibility but unused.
    """
    connection = connect(host=host, port=port, user=user,
                         password=password, database=database)
    return connection.cursor()
def main():
    """Smoke-test entry point: run ``SELECT now()`` on ClickHouse over TLS."""
    env.read_envfile()
    config = load_conf()
    # secure=True -> TLS connection
    cursor = connect(secure=True, **config['clickhouse']).cursor()
    q = f""" SELECT now(); """
    q = query_preprocess(q)
    print(q)
    print(fire_query(cursor, q))
def _retrieve_data(self, data_source):
    """Fetch *data_source* rows from ClickHouse into a DataFrame.

    Uses the data source's explicit ``query`` when present, otherwise a
    50-row preview of its ``table``.  ``parameters`` (or an empty dict) are
    passed through to the SQL layer.
    """
    connection = clickhouse_driver.connect(
        self.get_connection_url(database=data_source.database))
    try:
        query_params = data_source.parameters or {}
        query = (data_source.query if data_source.query
                 else f'select * from {data_source.table} limit 50;')
        df = pandas_read_sql(query, con=connection,
                             params=query_params, adapt_params=True)
    finally:
        # the original leaked the connection when pandas_read_sql raised
        connection.close()
    return df
def bi_idf(word_id_1, word_id_2):
    """BM25-style inverse document frequency for a bigram of word ids.

    Reads the corpus size and the bigram's document count from ClickHouse
    and returns the smoothed idf normalized by the idf of an
    everywhere-present term.
    """
    connection = clickhouse_driver.connect("clickhouse://127.0.0.1")
    cursor = connection.cursor()
    cursor.execute("SELECT COUNT(*) FROM documents")
    if cursor.rowcount:
        n_total = cursor.fetchone()[0]
    else:
        n_total = 0
    cursor.execute(
        "SELECT * FROM idf_doc_count_bigrams WHERE word_id_1 = %(id1)s AND word_id_2 = %(id2)s",
        {"id1": word_id_1, "id2": word_id_2})
    if cursor.rowcount:
        n_word = cursor.fetchone()[2]
    else:
        n_word = 0
    connection.close()
    return log2((n_total - n_word + 0.5) / (n_word + 0.5)) - log2(0.5 / (n_total + 0.5))
def insert(host_port):
    """Insert three fixed California city rows into clickhouse_db.city."""
    connection = clickhouse_driver.connect(
        host='127.0.0.1',
        port=host_port,
        database='clickhouse_db',
        user='******',
        password='******',
    )
    cursor = connection.cursor()
    cursor.execute("""INSERT into clickhouse_db.city values (3986,'Palmdale','USA','California',116670), (3999, 'Simi Valley','USA','California',111351), (3958,'Orange','USA','California',128821) """)
    cursor.close()
    connection.close()
def get_aircraft_geos(params, id):
    """Return the track points of ADS-B track *id*, newest first.

    Rows are (time_stamp, lon, lat, alt, serial) tuples; non-empty results
    come back as a numpy array sorted by time_stamp descending, an empty
    fetch result is returned as-is.
    """
    dsn = 'clickhouse://{}:{}@{}:9000/{}'.format(
        params["clickhouse_user"], params["clickhouse_pw"],
        params["clickhouse_host"], params["clickhouse_db"])
    cursor = connect(dsn).cursor()
    cursor.execute(
        """ SELECT time_stamp, longitude_aircraft, latitude_aircraft, altitude_aircraft, serial_number_equip FROM eco_monitoring.adsb_raw_data WHERE id_track = '{}' """.format(id))
    rows = cursor.fetchall()
    if not rows:
        return rows
    data = np.array(rows)
    # sort ascending on time_stamp (column 0), then reverse for newest-first
    return data[data[:, 0].argsort()][::-1]
def _cnx(self, database=None):
    """Open a connection for this RDS config.

    rds_category 1/2 -> MySQL-family via pymysql (with an enlarged
    group_concat limit); 3 -> ClickHouse.  Returns None on any error (or
    when the category matches neither branch).
    """
    if database:
        # NOTE(review): this mutates the shared self.config dict, so the
        # chosen database persists into later calls — confirm intended.
        self.config['database'] = database
    try:
        if self.rds_category in [1, 2]:
            cnx = pymysql.connect(**self.config)
            with cnx.cursor() as cursor:
                # allow very long GROUP_CONCAT results on this session
                cursor.execute('set session group_concat_max_len=1073741824;')
            return cnx
        if self.rds_category in [3]:
            # clickhouse
            cnx = clickhouse_driver.connect(**self.config)
            return cnx
    except Exception as err:
        logger.error(err)
        return None
def clickhouseSelect(self, sql):
    """Run *sql* and return the first row as a cleaned-up string.

    Returns the fetched row rendered via str() with '(', ',' and ')'
    removed; returns 0 on a general error and 1 on a NetworkError.
    """
    try:
        # NOTE(review): 8123 is ClickHouse's HTTP port while the native
        # protocol uses 9000 — confirm the intended port for this driver.
        qoeConnection = connect(self.qoeDbStr, port=8123)
        dbCursor = qoeConnection.cursor()
        dbCursor.execute(sql)
        result = dbCursor.fetchone()
        qoeConnection.close()
        fetch = str(result)
        for ch in ('(', ',', ')'):
            fetch = fetch.replace(ch, '')
        return str(fetch)
    # NetworkError must precede Exception: the original listed it second,
    # which made that handler unreachable (Exception matched first).
    except NetworkError as ne:
        logging.error(f'{self.cn} Error {ne}', exc_info=1)
        return 1
    except Exception as e:
        logging.error(f'{self.cn} Error {e}', exc_info=1)
        return 0
def get_word_ids(query):
    """Map each word of *query* to its id in the `words` table.

    Words are Porter-stemmed before lookup; stems missing from the table are
    reported on stdout and skipped.
    """
    con = clickhouse_driver.connect("clickhouse://127.0.0.1")
    stemmer = PorterStemmer()
    ids = []
    for start, end in TreebankWordTokenizer().span_tokenize(query):
        word = query[start:end]
        stem = stemmer.stem(word)
        cur = con.cursor()
        cur.execute("SELECT id FROM words WHERE word = %(word)s", {"word": stem})
        row = cur.fetchone()
        if row is None:
            print(
                f"Warning: Word {word} in form of {stem} not found in a database, skipping"
            )
        else:
            ids.append(row[0])
    return ids
def _remote_cnx(self):
    """Connect to the target database described by ``self.kwargs``.

    rds_category 1/2 -> MySQL/TiDB via pymysql with bulk-friendly settings;
    3 -> ClickHouse (whose driver rejects the 'charset' option).
    Returns the open connection, or None when the category matches neither.
    """
    config = self.kwargs.get('config')
    config['database'] = self.kwargs['schema']
    if self.kwargs['rds_category'] in [1, 2]:
        config.update({
            'max_allowed_packet': 1024 * 1024 * 1024,
            'db': self.kwargs['schema'],
            'read_timeout': 600,  # cap any single query at 600s
            'cursorclass': pymysql.cursors.DictCursor
        })
        # Kept disabled for compatibility with older servers; revisit once
        # server versions are collected:
        # if self.kwargs['rds_category'] in [1]:
        #     config['init_command'] = 'set session MAX_EXECUTION_TIME=600000'
        cnx = pymysql.connect(**config)
        return cnx
    if self.kwargs['rds_category'] in [3]:
        # pop with a default: the original plain pop() raised KeyError when
        # the config carried no 'charset' entry
        config.pop('charset', None)
        cnx = clickhouse_driver.connect(**config)
        return cnx
def save_to_clickhouse_1():
    """Airflow taskflow: insert one fixed row into test.testtable2.

    NOTE(review): the connection is created when the DAG is defined, not
    inside the task that uses it — confirm this is intended.
    """
    conn = connect("clickhouse://192.168.0.9")

    @task()
    def log(ds):
        print("Should be saved in clickhouse")
        print(ds)

    @task()
    def save():
        cursor = conn.cursor()
        cursor.execute(""" INSERT INTO test.testtable2 (id, `hostname`, `ip`, `random`) VALUES(0, 'somename1','192.168.0.1', '3892432'); """)
        cursor.close()

    # save() returns None, so log() just prints None after the insert ran
    aaa = save()
    log(aaa)
def get_from_clickhouse_1():
    """Airflow taskflow: read rows from test.testtable2 and log the result.

    NOTE(review): `result` holds cursor.execute()'s return value (None under
    DB-API) while cursor.fetchall()'s rows are discarded — presumably the
    fetchall() result was meant to be returned; confirm with the author.
    """
    conn = connect("clickhouse://192.168.0.9")

    @task()
    def log(ds):
        print("Should be saved in clickhouse")
        print(ds)

    @task()
    def save():
        cursor = conn.cursor()
        result = cursor.execute(""" SELECT * FROM test.testtable2 """)
        cursor.fetchall()
        print(result)
        cursor.close()
        return result

    aaa = save()
    log(aaa)
def ck_insert(self, jsons):
    '''
    Buffer one audit record and bulk-flush to ClickHouse when the batch
    reaches ``self.many`` records (best effort: failures are printed and the
    batch is dropped).

    The target table must be created first:
    CREATE table mysql_audit.mysql_audit_info(
        source_host String, source_port UInt64, destination_host String,
        destination_port UInt64, user_name String, sql String,
        reponse_value String, execute_time Float64, response_status String,
        event_date DateTime)
    ENGINE=MergeTree() PARTITION BY toYYYYMMDD(event_date)
    ORDER BY (source_host, source_port, event_date)
    TTL event_date + INTERVAL 5 DAY
    SETTINGS index_granularity=8192,enable_mixed_granularity_parts=1;

    NOTE(review): the INSERT below also writes a `db` column that this DDL
    does not define — confirm the real table includes `db String`.

    :param jsons: one audit record, appended to the batch
    :return: None
    '''
    self.op_list.append(jsons)
    self.op_num += 1
    if self.op_num >= self.many:
        conn = None
        try:
            ck_url = 'clickhouse://{}'.format(self.ckhost)
            conn = connect(ck_url)
            cursor = conn.cursor()
            cursor.executemany(
                'insert into mysql_audit.mysql_audit_info(source_host,source_port,destination_host,destination_port,user_name,'
                'sql,db,reponse_value,execute_time,response_status,event_date) values',
                self.op_list)
        except Exception:
            # was a bare "except:" — narrowed so KeyboardInterrupt/SystemExit
            # propagate; best-effort semantics otherwise unchanged
            print(traceback.format_exc())
        finally:
            # the original never closed the connection
            if conn is not None:
                conn.close()
        self.op_num = 0
        self.op_list = []
def connect_clickhouse(self):
    """Connect to ClickHouse and return a cursor.

    Connection details come from ``self.dbinfo`` when provided, otherwise
    from the ``db_ck`` section of ``./conf/`` via readconfini().
    """
    if self.dbinfo is not None:
        info = self.dbinfo
        host, port = info['host'], info['port']
        user, password = info['user'], info['password']
        database = info['database']
    else:
        env = readconfini('./conf/')
        host = env.get('db_ck', 'db_host')
        port = env.get('db_ck', 'db_port')
        user = env.get('db_ck', 'db_username')
        password = env.get('db_ck', 'db_password')
        database = env.get('db_ck', 'db_name')
    conn = connect(host=host, port=port, user=user, password=password,
                   database=database)
    return conn.cursor()
def sqlquery_sync_schemas_tables():
    """
    Sync remote schema information into the tables
    yasql_sqlquery_schemas / yasql_sqlquery_tables, used for query
    authorization.
    """
    ignored_schemas = ('PERFORMANCE_SCHEMA', 'INFORMATION_SCHEMA', 'PERCONA', 'MYSQL', 'SYS', 'DM_META',
                       'DM_HEARTBEAT', 'DBMS_MONITOR', 'METRICS_SCHEMA', 'TIDB_BINLOG', 'TIDB_LOADER',
                       '_TEMPORARY_AND_EXTERNAL_TABLES', 'DEFAULT', 'MYSQL_MONITOR', 'SYSTEM')
    # NOTE(review): the tuple is interpolated into SQL via its Python repr;
    # this only yields valid SQL because every element is a quoted string.
    mysql_query = f"SELECT " \
                  f"TABLE_SCHEMA," \
                  f"GROUP_CONCAT(TABLE_NAME) AS TABLE_NAME " \
                  f"FROM INFORMATION_SCHEMA.TABLES " \
                  f"WHERE TABLE_SCHEMA NOT IN {ignored_schemas} " \
                  f"GROUP BY TABLE_SCHEMA"
    clickhouse_query = f"SELECT " \
                       f"`database` AS TABLE_SCHEMA, " \
                       f"`tables` AS TABLE_NAME " \
                       f"FROM " \
                       f"(" \
                       f"SELECT " \
                       f"`database`, " \
                       f"groupArray(`name`) AS `name_array`, " \
                       f"arrayStringConcat(name_array, ',') AS `tables` " \
                       f"FROM system.tables " \
                       f"WHERE `database` NOT IN {ignored_schemas} GROUP BY `database`" \
                       f")"
    for row in DbConfig.objects.filter(use_type=1):
        try:
            result = []
            if row.rds_category in [1, 2]:
                # mysql/tidb
                config = {
                    'host': row.host,
                    'port': row.port,
                    # short socket timeout: e.g. Aliyun RDS hangs when the
                    # client IP is not whitelisted
                    'read_timeout': 3,
                }
                config.update(QUERY_USER)
                cnx = pymysql.connect(**config)
                cursor = cnx.cursor()
                cursor.execute('set session group_concat_max_len = 1073741824')
                cursor.execute(mysql_query)
                result = cursor.fetchall()
                cursor.close()
                cnx.close()
            if row.rds_category in [3]:
                # clickhouse
                config = {
                    'host': row.host,
                    'port': row.port,
                    'connect_timeout': 3,
                    'database': 'system',
                }
                config.update(QUERY_USER)
                cnx = clickhouse_driver.connect(**config)
                cursor = cnx.cursor()
                cursor.execute(clickhouse_query)
                result = cursor.fetchall()
                cursor.close()
                cnx.close()
            # rows are (schema, "tbl1,tbl2,...") pairs
            for i in result:
                obj, _ = DbQuerySchemas.objects.update_or_create(
                    cid_id=row.id,
                    schema=i[0],
                    defaults={'schema': i[0]})
                for t in i[1].split(','):
                    DbQueryTables.objects.update_or_create(
                        schema_id=obj.pk,
                        table=t,
                        defaults={'table': t})
        except Exception as err:
            # log the failing host and keep syncing the remaining configs
            logger.error(f"异常主机:{row.host}")
            logger.error(err)
            continue
    logger.info(f'dbms_sync_dbschems任务结束...')
import os
from nltk.tokenize.punkt import PunktSentenceTokenizer
from nltk import TreebankWordTokenizer
from nltk.stem import PorterStemmer
import clickhouse_driver

# Index-builder script: recreates the words/documents/inv_index tables in a
# local ClickHouse and registers every Gutenberg .txt file as a document.
# NOTE(review): this chunk appears truncated — `words_cache` and `ps` are
# never used in the visible portion, presumably by code that follows.
if __name__ == "__main__":
    data_path = "../data/Gutenberg/txt"
    files = [f for f in os.listdir(data_path) if f.endswith(".txt")]
    file_count = len(files)
    con = clickhouse_driver.connect("clickhouse://127.0.0.1")
    cur = con.cursor()
    # rebuild the schema from scratch on every run
    cur.execute("DROP TABLE IF EXISTS inv_index")
    cur.execute("DROP TABLE IF EXISTS documents")
    cur.execute("DROP TABLE IF EXISTS words")
    cur.execute("CREATE TABLE words(id INTEGER, word TEXT) ENGINE = MergeTree() ORDER BY id")
    cur.execute("CREATE TABLE documents(id INTEGER, name TEXT) ENGINE = MergeTree() ORDER BY id")
    cur.execute("""CREATE TABLE inv_index( word_id INTEGER, document_id INTEGER, start_pos INTEGER, end_pos INTEGER) ENGINE = MergeTree() ORDER BY document_id PARTITION BY document_id """)
    con.commit()
    words_cache = {}
    for (idx, filename) in enumerate(files, 1):
        ps = PorterStemmer()
        print(f"""Processing file {idx} of {file_count} - "(unknown)"...""")
        # document ids are 1-based positions in the directory listing
        cur.executemany("INSERT INTO documents(id, name) VALUES", [(idx, filename)])
        con.commit()