Example #1
def test():
    from datetime import datetime

    import pandas as pd
    from pandahouse import read_clickhouse

    from utils_ak.clickhouse.config import load_config
    # ingest_table is assumed to come from the same utils_ak.clickhouse package

    CONFIG = load_config("test")
    print(CONFIG)

    # one-row frame covering int, float, string and datetime columns
    df = pd.DataFrame([[1, 1.0, "a", datetime.now()]],
                      columns=["a", "b", "c", "d"])

    # write the frame into datasets.<table>, keyed on the first column
    table = "test13"
    ingest_table(df, CONFIG["clickhouse_url"], "datasets", table,
                 df.columns[0], df.columns[0])

    # read it back over the HTTP interface to verify the round trip
    connection = {
        "host": f'http://{CONFIG["clickhouse_url"]}:8123',
        "database": "datasets",
    }
    df = read_clickhouse(f"SELECT * FROM datasets.{table}",
                         index_col=None,
                         connection=connection)
    print(df)
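
Note: `ingest_table` above is a helper from the author's utils_ak package; pandahouse itself ships a `to_clickhouse` writer. Below is a minimal sketch of the same round trip using only pandahouse, assuming the target table already exists (`to_clickhouse` only inserts) and assuming a local server URL in place of the CONFIG lookup:

from datetime import datetime
import pandas as pd
from pandahouse import to_clickhouse, read_clickhouse

connection = {
    "host": "http://localhost:8123",  # assumed; the example reads the host from CONFIG
    "database": "datasets",
}

df = pd.DataFrame([[1, 1.0, "a", datetime.now()]],
                  columns=["a", "b", "c", "d"])

# to_clickhouse only inserts (the table must already exist) and
# returns the number of affected rows
affected = to_clickhouse(df, table="test13", index=False, connection=connection)
print(affected)

print(read_clickhouse("SELECT * FROM datasets.test13", connection=connection))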
Example #2
    def get_snp_info(self, sr_id):
        # Fetch one non-outlier row for the given snp_num.
        # Plain string formatting, so sr_id must be trusted/numeric.
        query = ("select chrom, snp_id, ea, ra, bp from {0}.{1} "
                 "where snp_num = {2} and outlier = 0 limit 1"
                 .format(CLICK_DB, CLICK_TABLE, sr_id))
        print(query)
        row = ph.read_clickhouse(query,
                                 connection=CLICKHOUSE_CONNECTION_PARAMS)
        return row
Example #3
def download(connection):
    # Join checks to devices on UserID and pull a 100k-row sample.
    query = """
    SELECT CAST(BuyDate AS Date) AS SalesDate, UserID, DeviceID, Rub
    FROM checks AS l
    LEFT JOIN (SELECT * FROM devices) AS r ON l.UserID = r.UserID
    LIMIT 100000
    """
    df = ph.read_clickhouse(query, connection=connection)
    return df
Example #4
    def select(self, query, output='dict', file='data.csv'):
        # Run a query and shape the result: 'table' returns the DataFrame,
        # 'csv' writes it to `file` and returns the path, 'dict' returns records.
        data = ph.read_clickhouse(query, connection=self.connector)
        if output == 'table':
            pass
        elif output == 'csv':
            data.to_csv(file, index=False)
            data = file
        elif output == 'dict':
            data = data.to_dict(orient='records')
        return data
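
For context, a hypothetical call site for select(); `ch` stands in for an instance of the surrounding class with .connector configured, and the query itself is an assumption:

df = ch.select("SELECT 1 AS x", output='table')                # pandas.DataFrame
path = ch.select("SELECT 1 AS x", output='csv', file='x.csv')  # 'x.csv'
rows = ch.select("SELECT 1 AS x")                              # [{'x': 1}]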
Example #5
def hello():
    key_word = request.args.get('key_word')
    name = request.args.get('name')
    sql_sentence = sqlSentence()
    sql = sql_sentence.sql
    sql_search_by_name = sql_sentence.sql_search_by_name
    sql_search_by_type = sql_sentence.sql_search_by_type
    sql_search_by_kind = sql_sentence.sql_search_by_kind
    sql_search_by_reason = sql_sentence.sql_search_by_reason
    sql_search_by_message = sql_sentence.sql_search_by_message
    sql_search_by_all_deployment_related = sql_sentence.sql_search_by_all_deployment_related
    # results to be returned
    data = []
    sql_result = []
    try:
        if key_word:
            key_word = key_word.lower()
        sql += " where " + sql_search_by_all_deployment_related
        sql += " and " + sql_search_by_name
        sql += "'" + name + "%'"
        if key_word:
            sql += " and ("
            sql += sql_search_by_type + "'%" + key_word + "%'"
            sql += sql_search_by_kind + "'%" + key_word + "%'"
            sql += sql_search_by_reason + "'%" + key_word + "%'"
            sql += sql_search_by_message + "'%" + key_word + "%'"
            sql += ")"
        sql += ";"
        print(sql)

        df = read_clickhouse(sql, connection=connection)
        print(df)

        # shape each row into a dict keyed by the externally defined `titles`
        data = [{
            titles[0]: row[0].strftime('%Y-%m-%d %H:%M:%S'),
            titles[1]: row[1],
            titles[2]: row[2] + ":" + row[5],
            titles[3]: row[3],
            titles[4]: row[4]
        } for row in df.values]

    except Exception as exc:
        print(exc)

    return json.dumps(data)
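
One caveat on this handler: `name` and `key_word` are spliced straight into the SQL string, so the endpoint is open to SQL injection. pandahouse offers no parameter binding, so at minimum the inputs should be escaped before interpolation. `ch_escape` below is a hypothetical sketch, not part of the original code; for LIKE patterns, `%` and `_` would also need escaping:

def ch_escape(value):
    # hypothetical helper: escape backslashes first, then single quotes,
    # as required inside ClickHouse single-quoted string literals
    return value.replace("\\", "\\\\").replace("'", "\\'")

name = ch_escape(request.args.get('name') or '')
key_word = ch_escape((request.args.get('key_word') or '').lower())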
Example #6
    def pull_data(self, start, stop, conditions='', columns='*'):
        """
        Pull data from a ClickHouse table into a pandas.DataFrame with the
        condition 'BETWEEN start AND stop' applied to Timestamp.

        Args:
            start : DateTime or string YYYY-MM-DD hh:mm:ss
            stop : DateTime or string YYYY-MM-DD hh:mm:ss
            conditions (str, optional) : Additional conditions to apply to the query. Default is to apply none: ''
                See <https://clickhouse.tech/docs/en/sql-reference/statements/select/where/>
            columns (optional): Columns to include in the query. Default is to select all: '*'

        Returns:
            pandas.DataFrame: Results of the query executed on the ClickHouse table
        """

        # handle columns formatting
        if not isinstance(columns, str) and isinstance(columns,
                                                       collections.abc.Iterable):
            if isinstance(columns, dict):
                # a dict maps column -> alias, rendered as "col AS alias"
                cols_string = ', '.join(
                    [key + ' AS ' + columns[key] for key in columns])
            else:
                cols_string = ', '.join(columns)
        elif isinstance(columns, str):
            # insert directly, assuming it is formatted properly
            cols_string = columns
        else:
            raise ValueError('Unsupported type ({}): columns must be a '
                             'string or an iterable of strings'.format(
                                 type(columns)))

        query = """
            SELECT {} FROM {}.{}
            WHERE Timestamp BETWEEN toDateTime('{}') AND toDateTime('{}')
            {}
        """.format(cols_string, self.owning_db, self.tbl_name, start, stop,
                   conditions)

        return read_clickhouse(query=query,
                               connection={
                                   'host': self.host_address,
                                   'database': self.owning_db
                               })
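
A hypothetical call site for pull_data(), showing the three accepted shapes of columns; the tbl instance, the column names and the extra condition are assumptions:

# plain string, inserted verbatim
df = tbl.pull_data('2021-01-01 00:00:00', '2021-01-02 00:00:00',
                   columns='Timestamp, Value')

# list of names, joined with commas
df = tbl.pull_data('2021-01-01 00:00:00', '2021-01-02 00:00:00',
                   columns=['Timestamp', 'Value'])

# dict of name -> alias, rendered as "Value AS v"; conditions are appended
# after the BETWEEN clause, so they should start with AND/OR
df = tbl.pull_data('2021-01-01 00:00:00', '2021-01-02 00:00:00',
                   columns={'Value': 'v'}, conditions="AND Value > 0")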
Example #7
def get_z_correlation(rs_id1, rs_id2):
    # Self-join the GWAS table on gwas_id to pair the z-scores of two SNPs.
    query = ("SELECT z_1, z_2 FROM ("
             "    SELECT gwas_1.gwas_id, gwas_1.z AS z_1"
             "    FROM {0}.{1} AS gwas_1"
             "    WHERE gwas_1.snp_num = {2} AND outlier = 0"
             ") ALL INNER JOIN ("
             "    SELECT gwas_2.gwas_id, gwas_2.z AS z_2"
             "    FROM {0}.{1} AS gwas_2"
             "    WHERE gwas_2.snp_num = {3} AND outlier = 0"
             ") USING gwas_id").format(CLICK_DB, CLICK_TABLE, rs_id1, rs_id2)

    z_df = ph.read_clickhouse(query, connection=CLICKHOUSE_CONNECTION_PARAMS)
    # restrict to |z| < 2 in both studies for the trimmed Pearson estimate
    z_df_filtered = z_df.loc[(abs(z_df['z_1']) < 2) & (abs(z_df['z_2']) < 2)]

    z_corr_spearman = stats.spearmanr(z_df['z_1'], z_df['z_2']).correlation
    z_corr_less_than_2 = stats.pearsonr(z_df_filtered['z_1'],
                                        z_df_filtered['z_2'])[0]
    z_corr_ordinary = stats.pearsonr(z_df['z_1'], z_df['z_2'])[0]

    return [z_corr_ordinary, z_corr_less_than_2, z_corr_spearman]
Example #8
def get_tables(host_address):
    """
    Returns an obj that mirrors the database/tables/columns structure on the host.

    This is useful if you want auto-completion for database and table names.

    The lowest nested level:
        database_name.table_name -> Table
    property will be a Table obj made up of the owning database, table and columns metadata.

    Args:
        host_address (str): address of the ClickHouse server

    Returns:
        object: dynamically created object that nests database_names.table_names.Table as properties
    """

    df = read_clickhouse(
        query=
        'SELECT database, table, name, type, compression_codec AS codec FROM system.columns',
        connection={
            'host': host_address,
            'database': 'system'
        })

    # build Column/Table objects per database, then expose them as
    # attributes of dynamically created classes to enable auto-completion
    db_dict = {}
    for db_name, df_dbs in df.groupby('database'):
        tbls_dict = {}
        for tbl_name, df_tbls in df_dbs.groupby('table'):
            cols = []
            for _, row in df_tbls.iterrows():
                cols.append(
                    Column(col_name=row['name'],
                           data_type=row['type'],
                           codec=row['codec']))

            tbls_dict[tbl_name] = Table(columns=cols,
                                        db_name=db_name,
                                        tbl_name=tbl_name,
                                        host_address=host_address)

        db_dict[db_name] = type('tables', (object, ), tbls_dict)

    return type('database', (object, ), db_dict)
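
A hypothetical use of the returned object, assuming Table and Column keep their constructor arguments as attributes; system.columns exists on any ClickHouse server, so it makes a safe demo path, while the host URL is an assumption:

server = get_tables('http://localhost:8123')

# databases and tables are attributes, so IDEs can auto-complete them
tbl = server.system.columns                # Table for system.columns
print(tbl.db_name, tbl.tbl_name)
print([c.col_name for c in tbl.columns])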


# from clickhouse_driver import Client
# import numpy as np
# client = Client(host_address)
# query = 'SELECT database, table, name, type, compression_codec as codec FROM system.columns'
# arr = np.array(client.execute(query))

# db_dict = {'host_address':host_address}
# for db_name in np.unique(arr[:,0]):

#     tbls_dict = {}
#     tbls = np.unique(arr[arr[:,0]==db_name, 1])

#     for tbl_name in tbls:
#         cols=[]

#         for col in arr[(arr[:,0]==db_name) & (arr[:,1]==tbl_name)]:

#             cols.append(
#                 Column(
#                     col_name = col[2],
#                     data_type=col[3],
#                     codec = col[4]
#                 )
#             )

#         tbls_dict[tbl_name] = Table(
#             columns=cols,
#             db_name=db_name,
#             tbl_name=tbl_name
#         )

#     db_dict[db_name] = type('tables', (object,), tbls_dict)

# return type('server', (object,), db_dict)