def test():
    from utils_ak.clickhouse.config import load_config
    from datetime import datetime

    CONFIG = load_config("test")
    print(CONFIG)

    import pandas as pd

    # One row covering the common column types: int, float, string, datetime.
    df = pd.DataFrame([[1, 1.0, "a", datetime.now()]], columns=["a", "b", "c", "d"])

    table = "test13"
    # ingest_table is assumed to be defined or imported elsewhere in this module.
    ingest_table(df, CONFIG["clickhouse_url"], "datasets", table, df.columns[0], df.columns[0])

    from pandahouse import read_clickhouse

    connection = {
        "host": f'http://{CONFIG["clickhouse_url"]}:8123',
        "database": "datasets",
    }
    # Read the table back to verify the round trip.
    df = read_clickhouse(f"SELECT * FROM datasets.{table}", index_col=None, connection=connection)
    print(df)
def get_snp_info(self, sr_id):
    query = (
        "SELECT chrom, snp_id, ea, ra, bp FROM {0}.{1} "
        "WHERE snp_num = {2} AND outlier = 0 LIMIT 1"
    ).format(CLICK_DB, CLICK_TABLE, sr_id)
    print(query)
    row = ph.read_clickhouse(query, connection=CLICKHOUSE_CONNECTION_PARAMS)
    return row
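# Illustrative configuration sketch (not from the original source): get_snp_info
# assumes module-level CLICK_DB, CLICK_TABLE and CLICKHOUSE_CONNECTION_PARAMS;
# the values below are placeholders showing the expected shapes.
CLICK_DB = "gwas"
CLICK_TABLE = "snp_stats"
CLICKHOUSE_CONNECTION_PARAMS = {
    "host": "http://localhost:8123",  # pandahouse talks to ClickHouse's HTTP interface
    "database": CLICK_DB,
}
# obj.get_snp_info(42)  # -> one-row DataFrame: chrom, snp_id, ea, ra, bp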
def download(connection):
    # Join purchases in `checks` with device metadata and cap the result size.
    query = """
        SELECT CAST(BuyDate AS Date) AS SalesDate, UserID, DeviceID, Rub
        FROM checks AS l
        LEFT JOIN (SELECT * FROM devices) AS r
        ON l.UserID = r.UserID
        LIMIT 100000
    """
    df = ph.read_clickhouse(query, connection=connection)
    return df
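# Hypothetical caller (host and database names are assumptions): pandahouse
# connects over ClickHouse's HTTP interface, which listens on port 8123 by default.
import pandahouse as ph

connection = {"host": "http://localhost:8123", "database": "default"}
sales = download(connection)
print(sales.head())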
def select(self, query, output='dict', file='data.csv'):
    data = ph.read_clickhouse(query, connection=self.connector)
    if output == 'table':
        pass  # return the DataFrame as-is
    elif output == 'csv':
        data.to_csv(file, index=False)
        data = file
    elif output == 'dict':
        data = data.to_dict(orient='records')
    return data
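# Illustrative use of the three output modes (the owning class and its
# `connector` attribute are assumptions; `client` stands in for an instance):
# rows = client.select("SELECT 1 AS x")                                # list of dicts
# df = client.select("SELECT 1 AS x", output='table')                  # pandas.DataFrame
# path = client.select("SELECT 1 AS x", output='csv', file='out.csv')  # writes CSV, returns the file name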
def hello():
    key_word = request.args.get('key_word')
    name = request.args.get('name')
    sql_sentence = sqlSentence()
    sql = sql_sentence.sql
    sql_search_by_name = sql_sentence.sql_search_by_name
    sql_search_by_type = sql_sentence.sql_search_by_type
    sql_search_by_kind = sql_sentence.sql_search_by_kind
    sql_search_by_reason = sql_sentence.sql_search_by_reason
    sql_search_by_message = sql_sentence.sql_search_by_message
    sql_search_by_all_deployment_related = sql_sentence.sql_search_by_all_deployment_related
    # results to return
    data = []
    sql_result = []
    try:
        if key_word:
            key_word = key_word.lower()
        # filter to deployment-related events whose name matches the given prefix
        sql += " where " + sql_search_by_all_deployment_related
        sql += " and " + sql_search_by_name
        sql += "'" + name + "%'"
        if key_word:
            # additionally match the keyword against type, kind, reason and message
            sql += " and ("
            sql += sql_search_by_type + "'%" + key_word + "%'"
            sql += sql_search_by_kind + "'%" + key_word + "%'"
            sql += sql_search_by_reason + "'%" + key_word + "%'"
            sql += sql_search_by_message + "'%" + key_word + "%'"
            sql += ")"
        sql += ";"
        print(sql)
        df = read_clickhouse(sql, connection=connection)
        print(df)
        data = [{
            titles[0]: row[0].strftime('%Y-%m-%d %H:%M:%S'),
            titles[1]: row[1],
            titles[2]: row[2] + ":" + row[5],
            titles[3]: row[3],
            titles[4]: row[4]
        } for row in df.values]
    except Exception as r:
        print(r)
    return json.dumps(data)
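# Minimal sketch of exercising the handler (the route decorator is not shown
# in the source; the '/' path and the `app` object are assumptions):
# with app.test_client() as client:
#     resp = client.get('/', query_string={'name': 'my-deployment', 'key_word': 'failed'})
#     print(resp.get_data(as_text=True))  # JSON list of matching events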
def pull_data(self, start, stop, conditions='', columns='*'):
    """
    Pull data from a ClickHouse table into a pandas.DataFrame with the
    condition: 'BETWEEN start AND stop'

    Args:
        start : DateTime or string YYYY-MM-DD hh:mm:ss
        stop : DateTime or string YYYY-MM-DD hh:mm:ss
        conditions (str, optional) : Additional conditions to apply to the query.
            Default is to apply none: ''
            See <https://clickhouse.tech/docs/en/sql-reference/statements/select/where/>
        columns (optional): Columns to include in the query. Default is to select all: '*'

    Returns:
        pandas.DataFrame: Results of the query executed on the ClickHouse table
    """
    # handle columns formatting; check str first, since strings are also iterable
    if isinstance(columns, str):
        # insert directly, assuming it is formatted properly
        cols_string = columns
    elif isinstance(columns, collections.abc.Iterable):
        if isinstance(columns, dict):
            # dicts map column names to aliases: {'col': 'alias'} -> 'col AS alias'
            cols_string = ', '.join(
                [key + ' AS ' + columns[key] for key in columns])
        else:
            cols_string = ', '.join(columns)
    else:
        print('Unsupported type: ({})'.format(type(columns)))
        print('param: columns must be a string or iterable')
        raise ValueError('columns must be a string or iterable')

    query = """
    SELECT {}
    FROM {}.{}
    WHERE Timestamp BETWEEN toDateTime('{}') AND toDateTime('{}')
    {}
    """.format(cols_string, self.owning_db, self.tbl_name, start, stop,
               conditions)
    return read_clickhouse(query=query,
                           connection={
                               'host': self.host_address,
                               'database': self.owning_db
                           })
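# Illustrative call (the instance `tbl` and all values are assumptions, not from
# the source). Note that `conditions` is appended verbatim after the BETWEEN
# clause, so it must start with AND/OR:
# df = tbl.pull_data(
#     '2021-01-01 00:00:00', '2021-01-02 00:00:00',
#     conditions="AND DeviceID = 7",
#     columns={'Timestamp': 'ts', 'Rub': 'revenue'},  # -> 'Timestamp AS ts, Rub AS revenue'
# )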
def get_z_correlation(rs_id1, rs_id2):
    # Pair z-scores of the two SNPs across studies via an inner join on gwas_id.
    query = """
        SELECT z_1, z_2
        FROM (
            SELECT gwas_1.gwas_id, gwas_1.z AS z_1
            FROM {0}.{1} AS gwas_1
            WHERE gwas_1.snp_num = {2} AND outlier = 0
        ) ALL INNER JOIN (
            SELECT gwas_2.gwas_id, gwas_2.z AS z_2
            FROM {0}.{1} AS gwas_2
            WHERE gwas_2.snp_num = {3} AND outlier = 0
        ) USING gwas_id
    """.format(CLICK_DB, CLICK_TABLE, rs_id1, rs_id2)
    z_df = ph.read_clickhouse(query, connection=CLICKHOUSE_CONNECTION_PARAMS)
    # Three correlation estimates: Pearson on all pairs, Pearson restricted to
    # |z| < 2, and Spearman.
    z_df_filtered = z_df.loc[(abs(z_df['z_1']) < 2) & (abs(z_df['z_2']) < 2)]
    z_corr_spearman = stats.spearmanr(z_df['z_1'], z_df['z_2']).correlation
    z_corr_less_than_2 = stats.pearsonr(z_df_filtered['z_1'], z_df_filtered['z_2'])[0]
    z_corr_ordinary = stats.pearsonr(z_df['z_1'], z_df['z_2'])[0]
    return [z_corr_ordinary, z_corr_less_than_2, z_corr_spearman]
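# Illustrative call (the SNP numbers are placeholders): the returned list holds
# Pearson over all pairs, Pearson restricted to |z| < 2, and Spearman.
pearson_all, pearson_trunc, spearman = get_z_correlation(101, 202)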
def get_tables(host_address):
    """
    Returns an obj that mirrors the database/tables/columns structure on the host.
    This is useful if you want auto complete to get database and table names.

    The lowest nested level: database_name.table_name -> Table property will be a
    Table obj made up of the owning database, table and columns metadata.

    Args:
        host_address (str): address for the ClickHouse server

    Returns:
        object: dynamically created object that nests
            database_names.table_names.Table as properties
    """
    df = read_clickhouse(
        query='SELECT database, table, name, type, compression_codec AS codec FROM system.columns',
        connection={
            'host': host_address,
            'database': 'system'
        })
    db_dict = {}
    for db_name, df_dbs in df.groupby('database'):
        tbls_dict = {}
        for tbl_name, df_tbls in df_dbs.groupby('table'):
            cols = []
            for _, row in df_tbls.iterrows():
                cols.append(
                    Column(col_name=row['name'],
                           data_type=row['type'],
                           codec=row['codec']))
            tbls_dict[tbl_name] = Table(columns=cols,
                                        db_name=db_name,
                                        tbl_name=tbl_name,
                                        host_address=host_address)
        db_dict[db_name] = type('tables', (object, ), tbls_dict)
    return type('database', (object, ), db_dict)

# Alternative implementation using clickhouse_driver instead of pandahouse,
# kept commented out for reference:
# from clickhouse_driver import Client
# import numpy as np
# client = Client(host_address)
# query = 'SELECT database, table, name, type, compression_codec as codec FROM system.columns'
# arr = np.array(client.execute(query))
# db_dict = {'host_address': host_address}
# for db_name in np.unique(arr[:, 0]):
#     tbls_dict = {}
#     tbls = np.unique(arr[arr[:, 0] == db_name, 1])
#     for tbl_name in tbls:
#         cols = []
#         for col in arr[(arr[:, 0] == db_name) & (arr[:, 1] == tbl_name)]:
#             cols.append(
#                 Column(col_name=col[2],
#                        data_type=col[3],
#                        codec=col[4]))
#         tbls_dict[tbl_name] = Table(columns=cols,
#                                     db_name=db_name,
#                                     tbl_name=tbl_name)
#     db_dict[db_name] = type('tables', (object,), tbls_dict)
# return type('server', (object,), db_dict)
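# Illustrative usage (the host and table names are assumptions): the returned
# class supports attribute access for auto-completion, e.g. server.<db>.<table>.
server = get_tables('http://localhost:8123')
tbl = server.default.checks  # assumes a table `default.checks` exists on the host
print([c.col_name for c in tbl.columns])  # assumes Column/Table keep these as attributes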