def execute_query(self, query_string):
    result_rows = []
    with connect(host=self.host, port=self.port, auth_mechanism=self.auth_mech,
                 user=self.username, password=self.password,
                 database=self.database, timeout=self.timeout) as conn:
        with conn.cursor() as cur:
            try:
                print("executing query")
                # Execute query
                cur.execute(query_string)
                print("done executing query")
                # Get column names
                columns = cur.description
                # The impyla build under conda (used in PCF) does not support the
                # ARRAY data type, so as a workaround we treat ARRAY columns as strings.
                if 'ARRAY' not in _TTypeId_to_TColumnValue_getters:
                    for index, val in enumerate(columns):
                        if val[1] == 'ARRAY':
                            cur._description[index] = (val[0], 'STRING', val[2], val[3],
                                                       val[4], val[5], val[6])
                # Fetch table results
                for row in cur:
                    result_obj = {}
                    for index, val in enumerate(columns):
                        # Strip everything up to the last dot so "table.column"
                        # keys become bare column names
                        result_obj[re.sub(r'.*[.]', '', val[0])] = row[index]
                    result_rows.append(result_obj)
            except Exception as e:
                return e
    return result_rows

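# A minimal usage sketch for execute_query above; the client object and the
# queried table are hypothetical. Rows come back as dicts keyed by the bare
# column name, because re.sub(r'.*[.]', '', ...) strips any "table." prefix,
# and the method returns the exception object (rather than raising) on failure.
rows = client.execute_query("SELECT t.id, t.name FROM demo_db.t LIMIT 10")
if isinstance(rows, Exception):
    print("query failed:", rows)
else:
    for row in rows:
        print(row["id"], row["name"])  # keys are "id"/"name", not "t.id"/"t.name"
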
def get_consumer_basket(discount_card_id=_discount_card_id, table_name=_table_name, query=None):
    conn = connect(host='node1.allende.bigkore.com', port=21050)
    cursor = conn.cursor()
    if not query:
        query = "select * from test.%s where discount_card_id='%s'" % (table_name, discount_card_id)
    cursor.execute(query)
    column_names = [c[0] for c in cursor.description]
    column_types = [get_internal_type_from_mssql_type(c[1]) for c in cursor.description]
    results = cursor.fetchall()

    def trim_if_string(i, v):
        if column_types[i] == 'string':
            return v.strip()
        return v

    results = [[trim_if_string(i, v) for i, v in enumerate(row)] for row in results]
    conn.close()
    json_result = {
        'col_names': column_names,
        'col_types': column_types,
        'data': results
    }
    return json.dumps(json_result)

def run_query(self, query, user):
    connection = None
    try:
        connection = connect(**self.configuration.to_dict())
        cursor = connection.cursor()
        cursor.execute(query)
        column_names = []
        columns = []
        for column in cursor.description:
            column_name = column[COLUMN_NAME]
            column_names.append(column_name)
            columns.append({
                'name': column_name,
                'friendly_name': column_name,
                'type': types_map.get(column[COLUMN_TYPE], None)
            })
        rows = [dict(zip(column_names, row)) for row in cursor]
        data = {'columns': columns, 'rows': rows}
        json_data = json.dumps(data, cls=JSONEncoder)
        error = None
        cursor.close()
    except DatabaseError as e:
        logging.exception(e)
        json_data = None
        error = str(e)
    except RPCError as e:
        logging.exception(e)
        json_data = None
        error = "Metastore Error [%s]" % str(e)
    except KeyboardInterrupt:
        connection.cancel()
        error = "Query cancelled by user."
        json_data = None
    except Exception as e:
        logging.exception(e)
        raise
    finally:
        if connection:
            connection.close()
    return json_data, error

def fetch(self, sql):
    if self.ds.database:
        conn = connect(host=self.ds.host, database=self.ds.database, port=self.ds.port)
    else:
        conn = connect(host=self.ds.host, port=self.ds.port)
    cur = conn.cursor()
    cur.execute(sql)
    rows = cur.fetchall()
    cur.close()
    conn.close()
    return rows

def test_connection():
    conn = connect(host='nuc02', port=21050)
    cur = conn.cursor()
    cur.execute('USE tpcds_1')
    cur.execute('SHOW TABLES')
    print(cur.fetchall())

def get_histogram_stacked(table_name, x_axis_column_name, y_axis_column_name, series_column_name=None):
    conn = connect(host='node1.allende.bigkore.com', port=21050)
    cursor = conn.cursor()
    query = "select %s, %s, %s from %s" % (x_axis_column_name, y_axis_column_name,
                                           series_column_name, table_name)
    print(query)
    cursor.execute(query)
    results = cursor.fetchall()
    x_axis = sorted(set(r[0] for r in results))
    series_dict = defaultdict(list)
    for r in results:
        series_dict[r[2]].append([r[0], float(r[1])])
    series = []
    for k, xy_values in sorted(series_dict.items(), key=lambda kv: kv[0]):
        x_values = set(x[0] for x in xy_values)
        # Pad missing x positions with zero so every series covers the full axis
        for x in x_axis:
            if x not in x_values:
                xy_values.append([x, 0])
        y_values = [xy[1] for xy in sorted(xy_values, key=lambda xy: xy[0])]
        series.append({
            'name': k,
            'data': y_values
        })
    return json.dumps({
        'series': series,
        'x_axis': x_axis
    })

def __init__(self):
    # start the connection
    self._conn = connect(host=hive_host, port=10000,
                         auth_mechanism=hive_auth_mechanism,
                         user=hive_user, password=hive_PW,
                         database='default',
                         kerberos_service_name=hive_kerberos_service_name)
    logging.info('Created db connection')
    # open the cursor
    self._cur = self._conn.cursor()
    logging.info('Created db cursor')
    # if there are any hive settings in the config.yml file, apply those with the "set" command
    if 'hive_settings' in cfg['hdfs'] and cfg['hdfs']['hive_settings'] is not None:
        for setting in cfg['hdfs']['hive_settings']:
            if setting != '':
                self._cur.execute('set ' + setting)
        logging.info('Added hive settings')
    # if there are any jar files mentioned in the config.yml file, apply those with the "add jar" command
    if 'hive_jars' in cfg['hdfs'] and cfg['hdfs']['hive_jars'] is not None:
        for setting in cfg['hdfs']['hive_jars']:
            if setting != '':
                self._cur.execute('add jar ' + setting)
        logging.info('Added hive jars')

def connect(self):
    _base_ = {"host": self.get_host(),
              "port": self.get_port()}
    _base_.update(**self.kwargs)
    self._db = connect(**_base_)
    self.cursor = self._cursor()

def get_hs2_impala_cursor(impalad, use_kerberos=False, database=None):
    """Get a cursor to an impalad.

    Args:
        impalad: A string in form 'hostname:port' or 'hostname'.
        use_kerberos: Boolean indicating whether to get a secure connection.
        database: Default db to use in the connection.

    Returns:
        HiveServer2Cursor if the connection succeeds, None otherwise.
    """
    try:
        host, port = impalad.split(":")
    except ValueError:
        host, port = impalad, DEFAULT_HS2_PORT
    cursor = None
    try:
        conn = connect(host=host, port=port, database=database,
                       auth_mechanism="GSSAPI" if use_kerberos else "NOSASL")
        cursor = conn.cursor()
        LOG.info("Connected to {0}:{1}".format(host, port))
    except Exception as e:
        LOG.error("Error connecting: {0}".format(str(e)))
    return cursor

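# A minimal usage sketch for get_hs2_impala_cursor above; the host:port value
# is hypothetical. With use_kerberos=True the helper picks the GSSAPI auth
# mechanism, otherwise NOSASL, and it returns None when the connection fails.
cursor = get_hs2_impala_cursor("impalad-host.example.com:21050", database="default")
if cursor is not None:
    cursor.execute("SHOW TABLES")
    print(cursor.fetchall())
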
def __init__(self, dttime, appid=0, platform=0):
    self.dttime = dttime
    self.appid = appid
    self.platform = platform
    self.hive_conn = HiveQuery()
    self.roomonline_hql_arr = []
    self.conn, self.cursor = conn2mysql(hconfig.mysqlhost, hconfig.mysqlport,
                                        hconfig.mysqluser, hconfig.mysqlpwd,
                                        hconfig.mysqldatabase)
    self.impalaconn = connect(host=hconfig.impalahost)
    self.impalacur = self.impalaconn.cursor()
    # Refresh Impala metadata for the tables this job reads
    self.impalacur.execute("refresh api_db_user_log")
    self.impalacur.execute("refresh api_db_user_info")
    self.impalacur.execute("refresh room_inout")
    self.current_date = datetime.datetime.strptime(self.dttime, "%Y%m%d").date()
    self.now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    self.dttime_begin = self.dttime[0:6] + '01'  # first day of the month
    self.activeuser_hql = ''
    self.un_activenewuser_hql = ''
    self.newuser_hql = ''
    self.whole_user_hql = ''
    self.visitnum_hql = ''
    self.platformSql = ''
    self.vistnum_web_hql = ''
    self.watch_time_90_hql = ""
    self.watch_time_300_hql = ""
    self.hql_generater()

def get_hive_conn(creds, server='hiveserver2.idx.expedmz.com', port=10001, db="lz"):
    # Import inside the function to avoid a useless dependency for projects that don't use Hive
    from impala.dbapi import connect  # to connect to Hive
    return connect(host=server, port=port, database=db, auth_mechanism="PLAIN",
                   user=creds['HiveServer2']['user'],
                   password=creds['HiveServer2']['pass'])

def query_impala_cursor(sql, params=None):
    conn = connect(host=current_app.config['IMPALA_HOST'],
                   port=current_app.config['IMPALA_PORT'],
                   SECURITY_TOKEN=current_app.config['SECURITY_TOKEN'])
    cursor = conn.cursor()
    cursor.execute(sql.encode('utf-8'), params)
    return cursor

def get_connnection(self):
    """Load HS2 client.

    Further calls to this method for the same alias will return the same
    client instance (in particular, any option changes to this alias will
    not be taken into account).
    """
    params = self.module.params
    auth_mechanism = params.get('authentication')
    user = params.get('user', None)
    password = params.get('password', None)
    kerberos_service_name = params.get('kerberos_service_name', None)
    host = params.get('host')
    port = params.get('port')
    use_ssl = params.get('verify', None)
    ca_cert = params.get('truststore', None)
    timeout = params.get('timeout', None)
    try:
        connnection = hs2.connect(
            host=host,
            port=port,
            auth_mechanism=auth_mechanism,
            user=user,
            password=password,
            kerberos_service_name=kerberos_service_name,
            use_ssl=use_ssl,
            ca_cert=ca_cert,
            timeout=timeout)
    except socket.error as e:
        self.module.fail_json(msg="Failed to open socket, %s." % str(e))
    return connnection

def _connect(self):
    return connect(database=self.db, user=self.user, password=self.pwd,
                   host=self.host, port=self.port, auth_mechanism=self.auth)

def run_hive(configData: ConfigData):
    a_client = InsecureClient(url=configData.hdfs_ip(), user="******")  # e.g. "http://10.2.201.197:50070"
    conn = connect(host=configData.hive_ip(), port=configData.hive_port(),
                   auth_mechanism=configData.hive_auth(), user=configData.hive_user())
    cur = conn.cursor()

    f_date_str = configData.get_f_date()  # "20181101"
    p_date_str = configData.get_p_date()  # "2018-11-01"

    # hdfs_dir_bl, e.g. "/data/posflow/allinpay_utf8_zc/20181101/" containing files such as
    # 20181101_loginfo_rsp_bl_new.csv, 20181101_rsp_agt_bl_new.del, 20181101_rxinfo_rsp_bl.txt
    root_path = str(pathlib.PurePosixPath(configData.get_hdfs_path()).joinpath(f_date_str))
    file_name = str(pathlib.PurePosixPath(root_path).joinpath(configData.get_file_name(f_date_str)))
    table_name = configData.get_table_name()  # e.g. 'test.t1_trxrecprd_v2_zc'

    print("Start\n")

    if MyHdfsFile.isfile(a_client, file_name):
        if not configData.get_has_partition():
            sql = "LOAD DATA INPATH '{}' INTO TABLE {}".format(file_name, table_name)
        else:
            sql = "LOAD DATA INPATH '{}' INTO TABLE {} PARTITION ( p_date='{}' )".format(
                file_name, table_name, p_date_str)
        print("OK" + " " + sql + "\n")
        cur.execute(sql)

    cur.close()
    conn.close()

def impala_query(sql):
    impala_HMS_HOST = os.getenv('IMPALA_HOST', 'url')
    impala = connect(host=impala_HMS_HOST, port=21050, use_ssl=False,
                     auth_mechanism='GSSAPI', kerberos_service_name='impala')
    proc_start = time.time()
    # Time check
    # --------------------------
    impala_cursor = impala.cursor()
    impala_cursor.execute(sql)
    df = as_pandas(impala_cursor)
    # --------------------------
    proc_end = time.time()
    columns = df.shape[1]
    nrows = len(df)
    # Calculation
    # --------------------------
    memory_usage = round(sum(df.memory_usage(deep=True)) / 1024**2, 6)  # MiB
    read_time = round(proc_end - proc_start, 6)  # seconds
    impala.close()
    return df

def hive_query(sql):
    HIVE_HMS_HOST = os.getenv('HIVE_HS2_HOST', 'url')
    hive = connect(host=HIVE_HMS_HOST, port=10000, use_ssl=False,
                   auth_mechanism='GSSAPI', kerberos_service_name='hive')
    proc_start = time.time()
    # Time check
    # --------------------------
    hive_cursor = hive.cursor()
    hive_cursor.execute(sql)
    df = as_pandas(hive_cursor)
    # --------------------------
    proc_end = time.time()
    columns = df.shape[1]
    nrows = len(df)
    # Calculation
    # --------------------------
    memory_usage = round(sum(df.memory_usage(deep=True)) / 1024**2, 6)  # MiB
    read_time = round(proc_end - proc_start, 6)  # seconds
    hive.close()
    return df

def get_conn(self, schema=None):
    db = self.get_connection(self.hiveserver2_conn_id)
    auth_mechanism = db.extra_dejson.get('authMechanism', 'PLAIN')
    kerberos_service_name = None
    if configuration.conf.get('core', 'security') == 'kerberos':
        auth_mechanism = db.extra_dejson.get('authMechanism', 'GSSAPI')
        kerberos_service_name = db.extra_dejson.get('kerberos_service_name', 'hive')

    # impyla uses GSSAPI instead of KERBEROS as an auth_mechanism identifier
    if auth_mechanism == 'KERBEROS':
        self.log.warning(
            "Detected deprecated 'KERBEROS' for authMechanism for %s. "
            "Please use 'GSSAPI' instead",
            self.hiveserver2_conn_id
        )
        auth_mechanism = 'GSSAPI'

    from impala.dbapi import connect
    return connect(
        host=db.host,
        port=db.port,
        auth_mechanism=auth_mechanism,
        kerberos_service_name=kerberos_service_name,
        user=db.login,
        database=schema or db.schema or 'default')

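# A hedged sketch of the Airflow connection "extra" JSON that the hook above
# reads via db.extra_dejson; the values are hypothetical. With core.security
# set to kerberos, authMechanism defaults to GSSAPI, and a legacy value of
# KERBEROS is rewritten to GSSAPI with a deprecation warning.
example_extra = {
    "authMechanism": "GSSAPI",         # read by db.extra_dejson.get('authMechanism', ...)
    "kerberos_service_name": "hive",   # only consulted when security is kerberos
}
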
def test_hive_plain_connect(self):
    self.connection = connect(ENV.host, ENV.hive_port,
                              auth_mechanism="PLAIN", timeout=5,
                              user=ENV.hive_user, password="******")
    self._execute_queries(self.connection)

def main():
    # CONNECTION TO DATABASE
    conn = connect(host='your hive db host Ip/hostname', port=10000,
                   auth_mechanism='PLAIN', user='******', password='******',
                   database='xxx')
    c = conn.cursor()

    # QUERIES
    query_1 = """
        SELECT *
        FROM your_table
    """
    query_2 = """
        SELECT *
        FROM your_table_2 a
        WHERE to_date(timestamp) = current_date
        GROUP BY x
    """

    # EXECUTE QUERIES
    # EXE TOTAL
    c.execute(query_1)
    result_1 = c.fetchall()
    # EXE TOTAL PER REGION
    c.execute(query_2)
    result_2 = c.fetchall()

    fields = ['field_1', 'field_2', 'field_3']
    # CONVERT DATA tuple TO dictionary
    total_per_region = [dict(zip(fields, d)) for d in result_2]

    # RETURN VALUES TO TEMPLATE
    return render_template('index.html', result1=result_1, result2=result_2)

def __init__(self, host, port):
    try:
        self.conn = connect(host=host, port=port)
    except Exception:
        print('impala connect error: Cannot connect to server!')
        raise  # without a connection, the cursor below would fail anyway
    self.cur = self.conn.cursor()

def data2hive():
    # Read the config file
    conf = ConfigParser.ConfigParser()
    conf.read('config.ini')
    ip = conf.get("hive", "ip")
    tablename = conf.get("hive", "tablename")
    db = conf.get("hive", "db")
    # Create the database connection
    conn = connect(host=ip, port=10000, database=db, auth_mechanism='PLAIN')
    # Create the cursor
    cursor = conn.cursor()
    try:
        sql = "select distinct questno,txt_content_url from " + tablename
        cursor.execute(sql)
        result = cursor.fetchall()
        count = 0
        if result is None:
            print("hive does not have data!")
            return
        for row in result:
            queue_data.put(row)
            count += 1
        print('count:' + str(count))
    except Exception as ex:
        print(ex)
    finally:
        cursor.close()
        conn.close()

def search_info(request):
    if request.method == 'GET':
        return render(request, "TestModel/hello.html", {"hello": "pass"})
    elif request.method == 'POST':
        ret = {}  # initialized up front so the return in the finally clause can't hit a NameError
        try:
            # Parse the request payload
            info = request.body
            data4info = json.loads(info)
            sql = doJoinSql(data4info)
            conn = connect(host='130.10.7.108', port=21050)
            cursor = conn.cursor()
            cursor.execute('USE rk')
            cursor.execute(sql)
            # Fetch the query results
            results = cursor.fetchall()
            print(results)
            ret = {
                'msg': testmsg(results),
                'code': testcode(results),
                'data': results2json(data4info, results)
            }
        except Exception as e:
            print("**************************************")
            print(e)
            print("**************************************")
            print("The above exception occurred")
        else:
            print("**************************************")
            print("All OK")
            print("**************************************")
        finally:
            return JsonResponse(ret, safe=False)
    else:
        return "Invalid request method"

def connect_impala():
    conn = connect(host='172.31.2.214', port=25000, user='******', password='******')
    cur = conn.cursor(user='******')
    return cur

def __init__(self, host='10.18.0.19', port=21050):
    # logging.basicConfig(level=logging.INFO)
    logger.info(host)
    logger.info(port)
    self.conn = connect(host=host, port=port, database='default')
    self.cur = self.conn.cursor()

def _getting_data_from_hive(self):
    log_instance.info("hive-1")
    # getting data from hive
    s1 = ("select distinct shop_id, t.shop_name "
          "from tmp.tmp_offline_dianping_shop_all_category t %s"
          % bc_config.cateName2WhereCondition_Dict[self.cate_name])
    inc_data_file = bc_config.cateName2Dir_Dict[self.cate_name] + "/inc_data/inc_data.txt"
    conn = None
    cur = None
    try:
        conn = connect(host='172.20.207.6', port=10000, auth_mechanism="PLAIN")
        cur = conn.cursor()
        log_instance.info("hive-2")
        log_instance.info("hive-sql: %s" % s1)
        cur.execute(s1)
        lst1 = []
        results = cur.fetchall()
        log_instance.info("hive-3")
        for d in results:
            lst1.append("%s\t%s" % (d[0], d[1].replace("\t", " ")))
        with open(inc_data_file, "w") as f1:
            f1.write("\n".join(lst1))
            f1.flush()
        log_instance.info("hive-4")
    except Exception as e:
        log_instance.error(traceback.format_exc())
        raise e
    finally:
        if cur is not None:
            cur.close()
        if conn is not None:
            conn.close()
    return 0

def run_remove_hive(conf: ConfigData, the_date: str, delta_day=0):
    f_date_str = StrTool.get_the_date_str(the_date, delta_day)  # "20181101"
    # e.g. "/user/hive/warehouse/rds_posflow.db/t1_trxrecprd_v2/t1_trxrecord_20181204_V2*.csv"
    del_table = conf.get_table_name()  # hive_table = "rds_posflow.t1_trxrecprd_v2"
    if conf.m_project_id == 1:
        del_file = conf.get_file_name(f_date_str).replace('.', '*.')
        MyHdfsFile.delete_hive_ssh(conf.get_data("cdh_ip"), table=del_table,
                                   p_name=del_file,
                                   username=conf.get_data("cdh_user"),
                                   password=conf.get_data("cdh_pass"))
    if conf.m_project_id == 2:
        conn = connect(host=conf.hive_ip(), port=conf.hive_port(),
                       auth_mechanism=conf.hive_auth(), user=conf.hive_user())
        cur = conn.cursor()
        # e.g. ALTER TABLE rds_posflow.t1_trxrecprd_v2_tmp DROP IF EXISTS PARTITION(p_date=20190208)
        sql = "ALTER TABLE {} DROP IF EXISTS PARTITION( p_date={} )".format(del_table, the_date)
        print(sql)
        cur.execute(sql)
        cur.close()
        conn.close()

def get_consumer_group_cheques(discount_card_id=_discount_card_id):
    conn = connect(host='node1.allende.bigkore.com', port=21050)
    cursor = conn.cursor()
    query = ("select consumer_id, date1, group23, quantity "
             "from test.consumer_date_purchases_group23 "
             "where consumer_id='%s'" % discount_card_id)
    print(query)
    cursor.execute(query)
    results = cursor.fetchall()
    results = [[str(r) for r in row] for row in results]
    conn.close()

    root_Node = id_name_Node('d=' + discount_card_id)
    date_node_dict = {}
    for row in results:
        date = row[1]
        if date in date_node_dict:
            date_node = date_node_dict[date]
        else:
            date_node = id_name_Node('date=' + date)
            date_node_dict[date] = date_node
        good_node = id_name_Node('' + row[2])
        good_node.children = [
            id_name_Node('quantity=' + row[3]),
        ]
        date_node.children.append(good_node)
    # Sort dates in descending order (the Python 2 cmp-based sort, rewritten with key/reverse)
    root_Node.children += sorted(date_node_dict.values(),
                                 key=lambda node: node.name, reverse=True)
    return json.dumps(_to_json(root_Node))

def run_query(cls, host_name, table, query, op='select'):
    """Execute an Impala query.

    Args:
        host_name: host name for the Impala connection
        table: database.table
        query: Impala query
        op: indicator for select
    """
    global IMPALA_CONN
    result = ''
    if not IMPALA_CONN:
        IMPALA_CONN = connect(host=host_name, port=25003, timeout=600,
                              use_kerberos=True)
    cur = IMPALA_CONN.cursor()
    # Can't invalidate something that doesn't yet exist
    if op != "create":
        ImpalaConnect.invalidate_metadata(host_name, table)
    cur.execute(query, configuration={'request_pool': 'ingestion'})
    if cur:
        if op == 'select':
            result = cur.fetchall()
    else:
        # Raising a bare string is invalid; use a proper exception type
        raise RuntimeError("Hive connection - cursor is none")
    return result

def impala_query(sql):
    conn = connect(**impala_config)
    cur = conn.cursor()
    cur.execute(sql)
    df = as_pandas(cur)
    conn.close()
    return df

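# A hedged sketch of the impala_config mapping the helper above unpacks into
# impala.dbapi.connect(); every value here is a hypothetical placeholder.
impala_config = {
    'host': 'impala-host.example.com',
    'port': 21050,
    'timeout': 600,
}
df = impala_query("SELECT 1")  # returns a pandas DataFrame via as_pandas
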
def conn(self):
    return connect(host='192.168.1.66', port=10000, user='******',
                   password='******', database='default',
                   auth_mechanism='PLAIN')

def create_connection():
    host, port = config.hive()
    conf = {}
    # TODO: if using hive, the kerberos service name must be changed; impyla sets 'impala' as default
    conf.update({'kerberos_service_name': 'hive'})
    if config.kerberos_enabled():
        principal, keytab, sasl_mech, security_proto = config.kerberos()
        conf.update({
            'auth_mechanism': 'GSSAPI',
        })
    else:
        conf.update({
            'auth_mechanism': 'PLAIN',
        })
    if config.ssl_enabled():
        ssl_verify, ca_location, cert, key = config.ssl()
        if ssl_verify.lower() == 'false':
            conf.update({'use_ssl': ssl_verify})
        else:
            conf.update({'ca_cert': cert, 'use_ssl': ssl_verify})
    db = config.db()
    conn = connect(host=host, port=int(port), database=db, **conf)
    return conn.cursor()

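# A minimal usage sketch for create_connection above; the statement is
# hypothetical. The helper hides whether Kerberos and SSL are enabled behind
# the config module and hands back a ready-to-use HiveServer2 cursor.
cursor = create_connection()
cursor.execute("SHOW DATABASES")
for row in cursor.fetchall():
    print(row[0])  # first column is the database name
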
def conn():
    return connect(host='master.mycdh.com', port=21050, database='movie_shop',
                   timeout=60, user='******', password='******')

def update_raw_stage(output, delivery_tag):
    # context = zmq.Context()
    # confirm = context.socket(zmq.PUSH)
    # confirm.connect(confirm_host)
    hdfs = PyWebHdfsClient(host=webhdfs_host, port=webhdfs_port, user_name=webhdfs_user)
    impala_conn = connect(host=impala_host, port=int(impala_port))
    cur = impala_conn.cursor()
    start_time = time.time()
    for k, v in output.items():
        if (time.time() - start_time) / 60 > sink_minutes:
            sink_logger.warning('ETL process running longer than sink timeout: {0} minutes'
                                .format((time.time() - start_time) / 60))
        try:
            file_name = 'user/impala/test_log_1/raw_log_{0}.txt'.format(k)
            hdfs.append_file(file_name, '\n'.join(v))
            cur.execute('refresh test_log_{0}'.format(k))
        except hdfs_err.PyWebHdfsException:
            # File doesn't exist yet: create it, then append
            file_name = 'user/impala/test_log_1/raw_log_{0}.txt'.format(k)
            hdfs.create_file(file_name, '')
            hdfs.append_file(file_name, '\n'.join(v))
            cur.execute('refresh test_log_{0}'.format(k))
    # confirm.send(delivery_tag)
    sink_logger.info('ETL process finished in {0} minutes'.format((time.time() - start_time) / 60))
    sink_logger.info('ETL process finished with {0} delivery_tag'.format(delivery_tag))

def run_sql(sql):
    conn = connect(host='172.17.69.25', auth_mechanism='PLAIN', port=21050,
                   user='******', password='******')
    cursor = conn.cursor()
    cnt = 1
    # 2.2 Execute the SQL against Impala, statement by statement
    if ';' in sql:
        sql_list = sql.rstrip().split(';')
        if len(sql_list[-1]):
            for s in sql_list:
                print("running sql @ %s" % cnt)
                cursor.execute(s)
                cnt += 1
        else:
            # Drop the empty fragment left by a trailing semicolon
            sql_list.pop()
            for s in sql_list:
                print("running sql @ %s" % cnt)
                cursor.execute(s)
                cnt += 1
    else:
        print("running sql @ %s" % cnt)
        cursor.execute(sql)
    # 2.3 Convert the result to a pandas DataFrame for analysis
    return as_pandas(cursor) if cursor.description is not None else 'null'

def impala_connect(sql, **kwargs):
    # impala
    host = kwargs.get("host", 'impala.bjds.belle.lan')
    port = kwargs.get("port", 21051)
    timeout = kwargs.get("timeout", 3600)
    # hive
    # host = kwargs.get("host", 'impala.bjds.belle.lan')
    # port = kwargs.get("port", 10008)
    # timeout = kwargs.get("timeout", 3600)
    user = kwargs.get("user", "lv.d.sz")
    password = kwargs.get("password", 'JHjLXpyQ')
    kerberos_service_name = kwargs.get("kerberos_service_name", "impala")
    conn = connect(host=host, port=port, timeout=timeout, user=user,
                   password=password, kerberos_service_name=kerberos_service_name,
                   auth_mechanism='LDAP')
    cur = conn.cursor(user=user)
    if sql is not None:
        cur.execute(sql)
    try:
        df = as_pandas(cur)
    except Exception:
        # No result set to convert (e.g. DDL); hand back the cursor instead
        return cur
    return df

def batching_data():
    input_date = '2015-01-03'
    # Connection to database
    conn = connect(host='localhost', port=10000, auth_mechanism='PLAIN',
                   user='******', password='******', database='gdelt_db')
    c = conn.cursor()
    # Avg tone per country from an initial date
    query = """
        SELECT ActionGeo_CountryCode, AVG(AvgTone)
        FROM gdelt_temporal
        WHERE to_date(SQLDATE) >= '{}'
        GROUP BY ActionGeo_CountryCode
    """
    # Execution of query
    c.execute(query.format(input_date))
    result_set = c.fetchall()
    # From list of tuples to list of dictionaries
    header = ['country', 'avg_tone']
    avg_per_country = [dict(zip(header, row)) for row in result_set]
    return jsonify(avg_per_country)

def run_hive(conf: ConfigData, the_date: str):
    client = Client(conf.hdfs_ip())  # e.g. "http://10.2.201.197:50070"
    conn = connect(host=conf.hive_ip(), port=conf.hive_port(),
                   auth_mechanism=conf.hive_auth(), user=conf.hive_user())
    cur = conn.cursor()

    the_date = StrTool.get_the_date_str(the_date)  # "20181101"
    root_path = conf.get_data("hdfs_dir_zc")  # "/data/posflow/allinpay_utf8_zc/"
    file_ext3 = conf.get_data("file_ext3")  # _loginfo_rsp.txt      -> 20181101_loginfo_rsp.txt
    file_ext4 = conf.get_data("file_ext4")  # _loginfo_rsp_agt.txt  -> 20181101_loginfo_rsp_agt.txt
    file_ext5 = conf.get_data("file_ext5")  # _rxinfo_rsp.txt       -> 20181101_rxinfo_rsp.txt
    file_ext6 = conf.get_data("file_ext6")  # _rxinfo_rsp_agt.txt   -> 20181101_rxinfo_rsp_agt.txt

    print("Start\n")

    file3 = str(pathlib.PurePosixPath(root_path).joinpath(the_date + file_ext3))
    file4 = str(pathlib.PurePosixPath(root_path).joinpath(the_date + file_ext4))
    file5 = str(pathlib.PurePosixPath(root_path).joinpath(the_date + file_ext5))
    file6 = str(pathlib.PurePosixPath(root_path).joinpath(the_date + file_ext6))

    f_list = [file3, file4, file5, file6]
    t_list = ["hive_table3", "hive_table4", "hive_table5", "hive_table6"]

    for n in range(0, 4):
        if MyHdfsFile.isfile(client, f_list[n]):
            sql = "LOAD DATA INPATH '" + f_list[n] + "' INTO TABLE " + conf.get_data(t_list[n])
            print("OK" + " " + sql + "\n")
            cur.execute(sql)

    cur.close()
    conn.close()

def hive_query(sql):
    conn = connect(**hive_config)
    cur = conn.cursor()
    cur.execute(sql)
    data = cur.fetchall()
    conn.close()
    return pd.DataFrame(data)

def open_connection(self):
    try:
        self.connection = connect(host=self.host, port=self.port)
        logger.info('Connected to HiveServer2 at %s:%s' % (self.host, self.port))
    except TTransportException:
        logger.error('Unable to open connection to HiveServer2 at %s:%s' % (self.host, self.port))
        sys.exit(2)
    self.cursor = self.connection.cursor()

def get_impala_connection():
    while True:
        try:
            conn = connect(host='al1.zmeke.com', port=21050)
            return conn
        except TTransportException as e:
            print(e)
            time.sleep(1)

def connect(self, host, port):
    '''
    :param host: Impala server hostname
    :param port: Impala server port
    :return: impyla cursor
    '''
    conn = connect(host=host, port=port)
    cursor = conn.cursor()
    return cursor

def main():
    logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s',
                        filename='/etl/logs/install_active_log.log',
                        level=logging.DEBUG)
    query_time = get_query_time()
    logging.info('Time for query: {0}'.format(query_time))

    # MySQL side: clear out rows newer than the query time
    cnx = sql.connect(user='******', password="******", host='localhost',
                      port=3301, database='matracking')
    cursor = cnx.cursor()
    delete_query = 'delete from install_active_log where active_timestamp > {0}'.format(query_time)
    cursor.execute(delete_query)
    logging.info('Rows deleted')
    logging.info('New max time: {0}'.format(get_new_max_time()))

    # Impala side: fetch the install/activity rows to re-insert
    conn = connect(host='h1.stats.zmeke.com', port=21050)
    cur = conn.cursor()
    cur.execute(
        "select ios_ifa, google_aid, windows_aid, unix_timestamp(p_tdate) as active_timestamp,"
        " installid, site_id, publisher_id, campaign_id, created"
        " from"
        " (select distinct installid, case when ios_ifa is null or ios_ifa='' or ios_ifa='NF' then"
        " upper(google_aid) else upper(ios_ifa) end as id, ios_ifa, google_aid, windows_aid,"
        " site_id, publisher_id, campaign_id, created,"
        " case when ios_ifa is null or ios_ifa='' or ios_ifa='NF' then"
        " 'andr' else 'ios' end as type"
        " from analytics.installs where campaign_id<>'0') t1"
        " inner join"
        " (select distinct case when device_ids_idfa_am2 is null or device_ids_idfa_am2='' or device_ids_idfa_am2='NF' then"
        " upper(device_ids_google_aid_am2) else upper(device_ids_idfa_am2) end as id, p_tdate"
        " from r12.log_1"
        " where unix_timestamp(p_tdate)>={0}) t2"
        " on t1.id=t2.id".format(query_time))
    impala_data = cur.fetchall()
    logging.info('Fetched rows from impala: {0}'.format(len(impala_data)))

    # Batch the inserts back into MySQL, 1000 statements at a time
    count = 0
    numb = 0
    add_row = ''
    for i in impala_data:
        if numb < 1000:
            count += 1
            numb += 1
            add_row += ("INSERT INTO matracking.install_active_log (ios_ifa, google_aid, windows_aid,"
                        " active_timestamp, installid, site_id, publisher_id, campaign_id, created)"
                        " VALUES %s; " % str(i))
        else:
            for result in cursor.execute(add_row, multi=True):
                pass
            cnx.commit()
            count += 1
            numb = 1
            add_row = ("INSERT INTO matracking.install_active_log (ios_ifa, google_aid, windows_aid,"
                       " active_timestamp, installid, site_id, publisher_id, campaign_id, created)"
                       " VALUES %s; " % str(i))
    # Flush the final partial batch
    for result in cursor.execute(add_row, multi=True):
        pass
    cnx.commit()
    cursor.close()
    cnx.close()
    logging.info('Rows inserted: {0}'.format(count))

def con(host, port, auth_mech, tmp_db):
    # create the temporary database
    con = connect(host=host, port=port, auth_mechanism=auth_mech)
    cur = con.cursor()
    cur.execute("CREATE DATABASE {0}".format(tmp_db))
    cur.close()
    con.close()
    # create the actual fixture
    con = connect(host=host, port=port, auth_mechanism=auth_mech, database=tmp_db)
    yield con
    con.close()
    # cleanup the temporary database
    con = connect(host=host, port=port, auth_mechanism=auth_mech)
    cur = con.cursor()
    force_drop_database(cur, tmp_db)
    cur.close()
    con.close()

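# A minimal usage sketch, assuming the generator above is registered as a
# pytest fixture (e.g. decorated with @pytest.fixture); the test body and the
# table name are hypothetical. The fixture yields a connection scoped to the
# temporary database and drops that database after the test finishes.
def test_create_table(con):
    cur = con.cursor()
    cur.execute("CREATE TABLE t (x INT)")
    cur.execute("SHOW TABLES")
    assert "t" in [row[0] for row in cur.fetchall()]
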
def db_connect(self):
    '''Connect to Hive using the settings in config.py; returns the connection and cursor.'''
    conn = connect(host=DbD['host'], port=DbD['port'], user=DbD['user'],
                   password=DbD['password'])
    cursor = conn.cursor()
    return conn, cursor

def _execute_actions(self):
    with dbapi.connect(host=self.server, port=self.port, user=self.user,
                       database=self.database, auth_mechanism="PLAIN") as conn:
        cursor = conn.cursor()
        for action in self.hql_actions:
            logger.debug("Executing hql action:\n{0}\n".format(action))
            cursor.execute(action)

def run(self):
    cnx = connect(host=self.host, port=self.port)
    cursor = cnx.cursor()
    try:
        while True:
            work_item = self.q.get(block=False)
            self.do_work(cursor, work_item)
    except Empty:
        pass
    finally:
        cnx.close()

def get_impala_connection(self):
    while True:
        try:
            impala_conn = connect(host=impala_host, port=int(impala_port))
            cur_impala = impala_conn.cursor()
            cur_impala.execute("refresh default.fail_test;")
            return impala_conn
        except TTransportException as e:
            logging.exception("Failed to connect to Impala")
            logging.exception(e)
            time.sleep(10)

def get_impala_connection(self):
    while True:
        try:
            impala_conn = connect(host=self.impala_host, port=int(self.impala_port))
            cur_impala = impala_conn.cursor()
            cur_impala.execute('refresh default.fail_test;')
            return impala_conn
        except TTransportException as e:
            self.sink_logger.exception("Failed to connect to Impala")
            self.sink_error_logger.exception(
                "Failed to connect to Impala with error {error}".format(error=e))
            time.sleep(10)

def get_impala_connection(self):
    while True:
        try:
            impala_conn = connect(host=impala_host, port=int(impala_port))
            cur_impala = impala_conn.cursor()
            cur_impala.execute('refresh default.fail_test;')
            return impala_conn
        except TTransportException as e:
            sink_logger.exception('Failed to connect to Impala')
            sink_error_logger.exception(e)
            time.sleep(10)

def get(self):
    conn = connect(host=host_machine_ip_address, port=21050)
    cur = conn.cursor()
    command = self.get_argument('query')
    # Refuse DROP statements outright
    if "drop" in command.lower():
        return
    cur.execute(command.encode('ascii', 'ignore').decode('ascii').replace(";", ""))
    data = cur.fetchall()
    print(data)
    code = html.prettyDatabase(data)
    json_data = {"data": data, "code": code, "query": command}
    self.write(json_data)

def get_impala_connection(self):
    try:
        impala_conn = connect(host=self.impala_host, port=int(self.impala_port), timeout=30)
        cur_impala = impala_conn.cursor()
        cur_impala.execute('refresh default.fail_test;')
        self.sink_logger.info('Connected to Impala node {0}'.format(self.impala_host))
        return True, impala_conn, cur_impala
    except TTransportException as e:
        self.sink_logger.exception("Failed to connect to Impala: {0}".format(self.impala_host))
        self.sink_error_logger.exception(
            "Failed to connect to Impala with error {error}".format(error=e))
        # Fall back to the reserve Impala node
        try:
            impala_conn = connect(host=self.impala_host_reserve, port=int(self.impala_port), timeout=30)
            cur_impala = impala_conn.cursor()
            cur_impala.execute('refresh default.fail_test;')
            self.sink_logger.info('Connected to Impala node {0}'.format(self.impala_host_reserve))
            return True, impala_conn, cur_impala
        except TTransportException as e:
            self.sink_logger.exception("Failed to connect to Impala: {0}".format(self.impala_host_reserve))
            self.sink_error_logger.exception(
                "Failed to connect to Impala with error {error}".format(error=e))
            return False, False, False

def desc_total_sales_volumn(year):
    # Redis: read cached value
    REDIS_KEY = "desc_total_sales_vol:{0}".format(year)
    cached_data = redis_io.read_transaction(REDIS_KEY)
    if cached_data is not None:
        return ast.literal_eval(cached_data)

    conn = connect(host='salest-master-server', port=21050)
    cur = conn.cursor()
    # daily transaction aggregation
    cur.execute('USE salest')
    bIsRealTimeUpdated = REFRESH_IMPALA_TABLE(cur, year)
    cur.execute("""
        SELECT year_month_day,
               SUM(num_of_product) AS num_of_product,
               SUM(sales_amount) AS total_amount
        FROM (
            SELECT SUBSTR(date_receipt_num,1,10) AS year_month_day,
                   num_of_product, sales_amount
            FROM """ + GET_IMPALA_TB_NAME(year) + """
            WHERE SUBSTR(date_receipt_num,1,4) = '""" + year + """'
        ) view_tr_recipt
        GROUP BY year_month_day
        ORDER BY year_month_day ASC
    """)
    df_tr_agg_daily = as_pandas(cur)
    conn.close()

    series_sum = df_tr_agg_daily[['num_of_product', 'total_amount']].sum()
    series_sum.name = 'sum'
    df_desc = df_tr_agg_daily.describe().append(series_sum)
    df_desc['num_of_product'] = df_desc['num_of_product'].apply(lambda v: round(v))
    df_desc['total_amount'] = df_desc['total_amount'].apply(lambda v: round(v))
    df_desc.fillna(0, inplace=True)

    cached_data = df_desc.to_dict()
    if not bIsRealTimeUpdated:
        # Redis: save cache value
        redis_io.write_transaction(REDIS_KEY, cached_data)
    # cached_data = ast.literal_eval(redis_io.read_transaction(REDIS_KEY))
    return cached_data

def _new_cursor(self):
    params = self.params.copy()
    con = impyla_dbapi.connect(database=self.database, **params)
    # make sure the connection works
    cursor = con.cursor()
    cursor.ping()
    wrapper = ImpalaCursor(cursor, self, self.database)
    if self.codegen_disabled:
        wrapper.disable_codegen(self.codegen_disabled)
    return wrapper

def get_timebase_data_on_past_specific_date(cur_date):
    # Redis: read cached value
    REDIS_KEY_PREFIX = "past_timebase_data_of"

    def get_cache_value(cur_date):
        return redis_io.read_dict_transaction(REDIS_KEY_PREFIX + ":" + cur_date)

    cached_data = get_cache_value(cur_date)
    if cached_data is not None:
        return cached_data

    conn = connect(host='salest-master-server', port=21050)
    cur = conn.cursor()
    cur.execute('USE salest')
    date_list = tuple(get_past_target_date(cur_date))
    cur.execute("""
        SELECT time_hour,
               CAST(SUM(sales_amount) AS INTEGER) AS total_amount,
               COUNT(sales_amount) AS num_of_transaction,
               COUNT(DISTINCT year_month_day) AS date_count
        FROM (
            SELECT SUBSTR(date_receipt_num,1,10) AS year_month_day,
                   SUBSTR(tr_time,1,2) AS time_hour,
                   sales_amount
            FROM ext_tr_receipt
            WHERE SUBSTR(date_receipt_num,1,10) IN ('%s')
        """ % date_list + """
        ) view_tr_total_amount_by_dayofweek
        GROUP BY time_hour
        ORDER BY time_hour ASC
    """)
    df_by_hour = as_pandas(cur)
    conn.close()

    df_by_hour.set_index('time_hour', inplace=True)
    # Make sure every business hour (10-23) appears, filling gaps with 0
    df_by_hour = df_by_hour.reindex([str(i) for i in np.arange(10, 24)], fill_value=0)
    dict_result = df_by_hour['total_amount'].to_dict()
    dict_result['date'] = date_list[0]
    # Cache for one hour
    redis_io.write_dict_transaction(REDIS_KEY_PREFIX + ":" + cur_date, dict_result, 60 * 60)
    return get_cache_value(cur_date)

def run_testing(self):
    conn = connect(host='172.168.0.24', port=21050, database='tpcds_text_150')
    cursor = conn.cursor()
    count = 0
    cur_task = 0
    query_time = []
    with open(os.path.join(self.profile_path, 'RESULT'), 'w') as fp_w:
        for query in self.query_list:
            if query not in self.des_dic.keys():
                self.des_dic[query] = {}
            for cpufreq in self.cpufreq_range:
                if cpufreq not in self.des_dic[query].keys():
                    self.CPUFreqSet_OBJ.set(cpufreq)
                    self.des_dic[query][cpufreq] = []
                start_time = time.time()
                # for i in range(self.every_query_times):
                with open(os.path.join(local_config()['query_dir'], query), 'r') as fp:
                    sql = fp.read().strip('profile;\n')
                    sql = sql.strip('; ')
                while True:
                    query_start_time = time.time()
                    cursor.execute('%s' % sql)
                    end_time = time.time()
                    query_time.append(end_time - query_start_time)
                    # Drain the result set
                    while True:
                        row = cursor.fetchone()
                        if not row:
                            break
                    cur_profile = cursor.get_profile()
                    count = count + 1
                    with open(os.path.join(self.profile_path, str(count) + '.log'), 'w') as fp_profile:
                        fp_profile.write("%s" % cur_profile)
                    if end_time - start_time > local_config()['duration_time']:
                        break
                self.des_dic[query][cpufreq].sort()
                print("%s,%s,%s" % (self.users_InTotal, self.user_name, count))
                # print >> self.logging, "%s,%s,%s" % (self.profile_path, self.user_name, count)
                fp_w.write("%s,%s,%s,%s" % (self.users_InTotal, self.user_name, count, query_time))
    cursor.close()
    conn.close()
    return self.des_dic

def __init__(self, temp_dir=None, temp_db=None, nn_host=None,
             webhdfs_port=50070, hdfs_user=None, *args, **kwargs):
    # args and kwargs get passed directly into impala.dbapi.connect()
    suffix = _random_id(length=8)
    self._temp_dir = '/tmp/impyla-%s' % suffix if temp_dir is None else temp_dir
    self._temp_db = 'tmp_impyla_%s' % suffix if temp_db is None else temp_db
    self._conn = connect(*args, **kwargs)
    self._cursor = self._conn.cursor()
    # used for pywebhdfs cleanup of temp dir; not required
    self._nn_host = nn_host
    self._webhdfs_port = webhdfs_port
    self._hdfs_user = hdfs_user
    if temp_db is None:
        self._cursor.execute("CREATE DATABASE %s LOCATION '%s'" %
                             (self._temp_db, self._temp_dir))

def connect(self):
    LOG.info("-- connecting to {0} with impyla".format(self.__host_port))
    host, port = self.__host_port.split(":")
    self.__impyla_conn = impyla.connect(host=host, port=int(port))
    LOG.info("Conn {0}".format(self.__impyla_conn))
    # Get the default query options for the session before any modifications are made.
    self.__cursor = self.__impyla_conn.cursor()
    self.__cursor.execute("set all")
    self.__default_query_options = {}
    for name, val, _ in self.__cursor:
        self.__default_query_options[name] = val
    self.__cursor.close_operation()
    LOG.debug("Default query options: {0}".format(self.__default_query_options))