def execute_query(self, query_string):
        result_rows = []

        with connect(host=self.host, port=self.port, auth_mechanism=self.auth_mech, user=self.username,
                     password=self.password, database=self.database, timeout=self.timeout) as conn:
            with conn.cursor() as cur:
                try:
                    print "executing query"

                    # Execute query
                    cur.execute(query_string)

                    print "done executing query"

                    # Get column names
                    columns = cur.description

                    # Impyla library under conda (used in PCF) does not support ARRAY data type. Therefore in order to
                    # patch, we will treat array types as strings
                    if 'ARRAY' not in _TTypeId_to_TColumnValue_getters:
                        for index, val in enumerate(columns):
                            if val[1] == 'ARRAY':
                                cur._description[index] = (val[0], 'STRING', val[2], val[3], val[4], val[5], val[6])

                    # Fetch table results
                    for row in cur:
                        result_obj = {}
                        for index, val in enumerate(columns):
                            # Remove characters and dot which precedes column name for key values
                            result_obj[re.sub(r'.*[.]', '', val[0])] = row[index]
                        result_rows.append(result_obj)
                except Exception as e:
                    return e

        return result_rows
Example #2
def get_consumer_basket(discount_card_id=_discount_card_id, table_name=_table_name, query=None):
    conn = connect(host='node1.allende.bigkore.com', port=21050)
    cursor = conn.cursor()
    if not query:
        query = "select * from test.%s where discount_card_id='%s'" % (table_name, discount_card_id)
    cursor.execute(query)
    column_names = [c[0] for c in cursor.description]
    column_types = [get_internal_type_from_mssql_type(c[1]) for c in cursor.description]
    results = cursor.fetchall()

    def trim_if_string(i, v):
        if column_types[i] == 'string':
            return v.strip()
        return v

    results = [[trim_if_string(i, v) for i, v in enumerate(row)] for row in results]

    conn.close()

    json_result = {
        'col_names': column_names,
        'col_types': column_types,
        'data': results
    }

    return json.dumps(json_result)
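
The string-built query above is injection-prone for the card id. A hedged variant using impyla's DB-API parameter binding (pyformat style) for the value; the table name still cannot be bound and is assumed validated elsewhere:

def get_consumer_basket_safe(discount_card_id=_discount_card_id, table_name=_table_name):
    # Sketch only: same connection details as the example above.
    conn = connect(host='node1.allende.bigkore.com', port=21050)
    cursor = conn.cursor()
    # %% escapes the placeholder so only the table name is formatted in.
    query = "select * from test.%s where discount_card_id=%%(card)s" % table_name
    cursor.execute(query, {'card': discount_card_id})
    rows = cursor.fetchall()
    conn.close()
    return rows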
Example #3
    def run_query(self, query, user):

        connection = None
        try:
            connection = connect(**self.configuration.to_dict())

            cursor = connection.cursor()

            cursor.execute(query)

            column_names = []
            columns = []

            for column in cursor.description:
                column_name = column[COLUMN_NAME]
                column_names.append(column_name)

                columns.append({
                    'name': column_name,
                    'friendly_name': column_name,
                    'type': types_map.get(column[COLUMN_TYPE], None)
                })

            rows = [dict(zip(column_names, row)) for row in cursor]

            data = {'columns': columns, 'rows': rows}
            json_data = json.dumps(data, cls=JSONEncoder)
            error = None
            cursor.close()
        except DatabaseError as e:
            logging.exception(e)
            json_data = None
            error = str(e)
        except RPCError as e:
            logging.exception(e)
            json_data = None
            error = "Metastore Error [%s]" % e.message
        except KeyboardInterrupt:
            connection.cancel()
            error = "Query cancelled by user."
            json_data = None
        except Exception as e:
            logging.exception(e)
            raise
        finally:
            if connection:
                connection.close()

        return json_data, error
Example #4
    def fetch(self, sql):
        if self.ds.database:
            conn = connect(host=self.ds.host,
                           database=self.ds.database,
                           port=self.ds.port)
        else:
            conn = connect(host=self.ds.host, port=self.ds.port)
        cur = conn.cursor()
        cur.execute(sql)
        rows = cur.fetchall()

        cur.close()
        conn.close()

        return rows
Example #5
def test_connection():
    conn = connect(host='nuc02', port=21050)
    cur = conn.cursor()

    cur.execute('USE tpcds_1')
    cur.execute('SHOW TABLES')
    print(cur.fetchall())
Example #6
def get_histogram_stacked(table_name, x_axis_column_name, y_axis_column_name, series_column_name=None):
    conn = connect(host='node1.allende.bigkore.com', port=21050)
    cursor = conn.cursor()
    query = "select %s, %s, %s from %s" % (x_axis_column_name, y_axis_column_name, series_column_name, table_name)
    print(query)
    cursor.execute(query)
    results = cursor.fetchall()
    x_axis = sorted(list(set([r[0] for r in results])))
    series_dict = defaultdict(lambda: [])
    for r in results:
        series_dict[r[2]].append([r[0], float(r[1])])
    series = []
    for k, xy_values in sorted(series_dict.items(), key=lambda kv: kv[0]):
        x_values = set([x[0] for x in xy_values])
        for x in x_axis:
            if x not in x_values:
                xy_values.append([x, 0])

        y_values = [xy[1] for xy in sorted(xy_values, key=lambda xy: xy[0])]
        series.append({
            'name': k,
            'data': y_values
        })

    return json.dumps({
        'series': series,
        'x_axis': x_axis
    })
Example #7
    def __init__(self):
        # start the connection
        self._conn = connect(host=hive_host,
                             port=10000,
                             auth_mechanism=hive_auth_mechanism,
                             user=hive_user,
                             password=hive_PW,
                             database='default',
                             kerberos_service_name=hive_kerberos_service_name)
        logging.info('Created db connection')

        # open the cursor
        self._cur = self._conn.cursor()
        logging.info('Created db cursor')

        # if there are any hive settings in the config.yml file, apply those with the "set" command
        if 'hive_settings' in cfg['hdfs'] and cfg['hdfs'][
                'hive_settings'] is not None:
            for setting in cfg['hdfs']['hive_settings']:
                if setting != '':
                    self._cur.execute('set ' + setting)
        logging.info('Added hive settings')

        # if there are any jar files mentioned in the config.yml file, apply those with the "add jar" command
        if 'hive_jars' in cfg['hdfs'] and cfg['hdfs']['hive_jars'] is not None:
            for setting in cfg['hdfs']['hive_jars']:
                if setting != '':
                    self._cur.execute('add jar ' + setting)
        logging.info('Added hive jars')
Example #8
 def connect(self):
     _base_ = {"host": self.get_host(),
               "port": self.get_port()
               }
     _base_.update(**self.kwargs)
     self._db = connect(**_base_)
     self.cursor = self._cursor()
Example #9
def get_hs2_impala_cursor(impalad, use_kerberos=False, database=None):
  """Get a cursor to an impalad

  Args:
    impalad: A string in form 'hostname:port' or 'hostname'
    use_kerberos: boolean indication whether to get a secure connection.
    database: default db to use in the connection.

  Returns:
    HiveServer2Cursor if the connection succeeds, None otherwise.
  """
  try:
    host, port = impalad.split(":")
  except ValueError:
    host, port = impalad, DEFAULT_HS2_PORT
  cursor = None
  try:
    conn = connect(host=host,
        port=port,
        database=database,
        auth_mechanism="GSSAPI" if use_kerberos else "NOSASL")
    cursor = conn.cursor()
    LOG.info("Connected to {0}:{1}".format(host, port))
  except Exception as e:
    LOG.error("Error connecting: {0}".format(str(e)))
  return cursor
Example #10
 def __init__(self, dttime, appid=0, platform=0):
     self.dttime = dttime
     self.appid = appid
     self.platform = platform
     self.hive_conn = HiveQuery()
     self.roomonline_hql_arr = []
     self.conn, self.cursor = conn2mysql(hconfig.mysqlhost, hconfig.mysqlport, hconfig.mysqluser, hconfig.mysqlpwd,
                                         hconfig.mysqldatabase)
     self.impalaconn = connect(host=hconfig.impalahost)
     self.impalacur = self.impalaconn.cursor()
     self.impalacur.execute("refresh api_db_user_log")
     self.impalacur.execute("refresh api_db_user_info")
     self.impalacur.execute("refresh room_inout")
     self.current_date = datetime.datetime.strptime(self.dttime, "%Y%m%d").date()
     self.now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
     self.dttime_begin = ((self.dttime)[0:6] + '01')
     self.activeuser_hql = ''
     self.un_activenewuser_hql = ''
     self.newuser_hql = ''
     self.whole_user_hql = ''
     self.visitnum_hql = ''
     self.platformSql = ''
     self.vistnum_web_hql = ''
     self.watch_time_90_hql = ""
     self.watch_time_300_hql = ""
     self.hql_generater()
Example #11
def get_hive_conn(creds, server='hiveserver2.idx.expedmz.com', port=10001, db="lz") :
    # Import inside the function to avoid a useless dependency for projects that don't use Hive
    from impala.dbapi import connect # To connect to Hive 
    
    return connect(host=server, port=port, 
                    database=db, auth_mechanism="PLAIN", 
                    user=creds['HiveServer2']['user'], password=creds['HiveServer2']['pass'])
Example #12
def query_impala_cursor(sql, params=None):
    conn = connect(host=current_app.config['IMPALA_HOST'],
                   port=current_app.config['IMPALA_PORT'],
                   SECURITY_TOKEN=current_app.config['SECURITY_TOKEN'])
    cursor = conn.cursor()
    cursor.execute(sql.encode('utf-8'), params)
    return cursor
Example #13
    def get_connection(self):
        """Load HS2 client.

        Further calls to this method for the same alias will return the same client
        instance (in particular, any option changes to this alias will not be taken
        into account).

        """
        params = self.module.params
        auth_mechanism = params.get('authentication')
        user = params.get('user', None)
        password = params.get('password', None)
        kerberos_service_name = params.get('kerberos_service_name', None)
        host = params.get('host')
        port = params.get('port')
        use_ssl = params.get('verify', None)
        ca_cert = params.get('truststore', None)
        timeout = params.get('timeout', None)

        try:
            connection = hs2.connect(
                host=host,
                port=port,
                auth_mechanism=auth_mechanism,
                user=user,
                password=password,
                kerberos_service_name=kerberos_service_name,
                use_ssl=use_ssl,
                ca_cert=ca_cert,
                timeout=timeout)
        except socket.error as e:
            self.module.fail_json(msg="Failed to open socket, %s." % str(e))
        return connection
Example #14
 def _connect(self):
     return connect(database=self.db,
                    user=self.user,
                    password=self.pwd,
                    host=self.host,
                    port=self.port,
                    auth_mechanism=self.auth)
Example #15
def run_hive(configData: ConfigData):
    a_client = InsecureClient(url=configData.hdfs_ip(), user="******")  # "http://10.2.201.197:50070"
    conn = connect(host=configData.hive_ip(), port=configData.hive_port(), auth_mechanism=configData.hive_auth(), user=configData.hive_user())
    cur = conn.cursor()

    f_date_str = configData.get_f_date()  # "20181101"
    p_date_str = configData.get_p_date()  # "2018-11-01"

    # hdfs_dir_bl
    root_path = str(pathlib.PurePosixPath(configData.get_hdfs_path()).joinpath(f_date_str))
    file_name = str(pathlib.PurePosixPath(root_path).joinpath(configData.get_file_name(f_date_str)))
    # "/data/posflow/allinpay_utf8_zc/20181101/"
    # 20181101_loginfo_rsp_bl_new.csv
    # 20181101_rsp_agt_bl_new.del
    # 20181101_rxinfo_rsp_bl.txt

    table_name = configData.get_table_name()

    print("Start\n")

    if MyHdfsFile.isfile(a_client, file_name):
        if not configData.get_has_partition():
            sql = "LOAD DATA INPATH '{}' INTO TABLE {}".format(file_name, table_name)  # 'test.t1_trxrecprd_v2_zc'
            # '\' OVERWRITE INTO TABLE test.t1_trxrecprd_v2_bl2'
        else:
            sql = "LOAD DATA INPATH '{}' INTO TABLE {} PARTITION ( p_date='{}' )".format(file_name, table_name, p_date_str)  # 'test.t1_trxrecprd_v2_zc'
        print("OK" + "  " + sql+"\n")
        cur.execute(sql)  # , async=True)

    cur.close()
    conn.close()
Example #16
def test_connection():
    conn = connect(host="nuc02", port=21050)
    cur = conn.cursor()

    cur.execute("USE tpcds_1")
    cur.execute("SHOW TABLES")
    print(cur.fetchall())
Example #17
def impala_query(sql):

    impala_HMS_HOST = os.getenv('IMPALA_HOST', 'url')
    impala = connect(host=impala_HMS_HOST,
                     port=21050,
                     use_ssl=False,
                     auth_mechanism='GSSAPI',
                     kerberos_service_name='impala')

    proc_start = time.time()
    # Time Check
    # --------------------------
    impala_cursor = impala.cursor()
    impala_cursor.execute(sql)
    df = as_pandas(impala_cursor)
    # --------------------------
    proc_end = time.time()

    columns = df.shape[1]
    nrows = len(df)

    # Calculation
    # --------------------------
    memory_usage = round(sum(df.memory_usage(deep=True)) / 1024 ** 2, 6)
    read_time = round(proc_end - proc_start, 6)

    impala.close()
    return df
Example #18
def hive_query(sql):

    HIVE_HMS_HOST = os.getenv('HIVE_HS2_HOST', 'url')
    hive = connect(host=HIVE_HMS_HOST,
                   port=10000,
                   use_ssl=False,
                   auth_mechanism='GSSAPI',
                   kerberos_service_name='hive')

    proc_start = time.time()
    # Time Check
    # --------------------------
    hive_cursor = hive.cursor()
    hive_cursor.execute(sql)
    df = as_pandas(hive_cursor)
    # --------------------------
    proc_end = time.time()

    columns = df.shape[1]
    nrows = len(df)

    # Calculation
    # --------------------------
    memory_usage = round(sum(df.memory_usage(deep=True)) / 1024 ** 2, 6)
    read_time = round(proc_end - proc_start, 6)

    hive.close()
    return df
Example #19
    def get_conn(self, schema=None):
        db = self.get_connection(self.hiveserver2_conn_id)
        auth_mechanism = db.extra_dejson.get('authMechanism', 'PLAIN')
        kerberos_service_name = None
        if configuration.conf.get('core', 'security') == 'kerberos':
            auth_mechanism = db.extra_dejson.get('authMechanism', 'GSSAPI')
            kerberos_service_name = db.extra_dejson.get('kerberos_service_name', 'hive')

        # impyla uses GSSAPI instead of KERBEROS as an auth_mechanism identifier
        if auth_mechanism == 'KERBEROS':
            self.log.warning(
                "Detected deprecated 'KERBEROS' for "
                "authMechanism for %s. Please use 'GSSAPI' instead",
                self.hiveserver2_conn_id
            )
            auth_mechanism = 'GSSAPI'

        from impala.dbapi import connect
        return connect(
            host=db.host,
            port=db.port,
            auth_mechanism=auth_mechanism,
            kerberos_service_name=kerberos_service_name,
            user=db.login,
            database=schema or db.schema or 'default')
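
For reference, a hedged sketch of the connection 'extra' JSON the hook above reads through extra_dejson on a Kerberized cluster; the keys mirror the lookups in the code, the values are illustrative:

# Stored in the Airflow connection's Extra field as JSON:
extra = {
    "authMechanism": "GSSAPI",           # 'KERBEROS' is accepted but warned as deprecated
    "kerberos_service_name": "hive",
}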
Example #20
 def test_hive_plain_connect(self):
     self.connection = connect(ENV.host, ENV.hive_port,
                               auth_mechanism="PLAIN",
                               timeout=5,
                               user=ENV.hive_user,
                               password="******")
     self._execute_queries(self.connection)
Example #21
def get_hs2_impala_cursor(impalad, use_kerberos=False, database=None):
    """Get a cursor to an impalad

  Args:
    impalad: A string in form 'hostname:port' or 'hostname'
    use_kerberos: boolean indication whether to get a secure connection.
    database: default db to use in the connection.

  Returns:
    HiveServer2Cursor if the connection succeeds, None otherwise.
  """
    try:
        host, port = impalad.split(":")
    except ValueError:
        host, port = impalad, DEFAULT_HS2_PORT
    cursor = None
    try:
        conn = connect(host=host,
                       port=port,
                       database=database,
                       auth_mechanism="GSSAPI" if use_kerberos else "NOSASL")
        cursor = conn.cursor()
        LOG.info("Connected to {0}:{1}".format(host, port))
    except Exception as e:
        LOG.error("Error connecting: {0}".format(str(e)))
    return cursor
Example #22
def main():
    # CONNECTION TO DATABASE
    conn = connect(host='your hive db host Ip/hostname', port=10000, auth_mechanism='PLAIN', user='******', password='******',
                   database='xxx')
    c = conn.cursor()

    # QUERIES
    query_1 = """
      SELECT *
      FROM your_table
    """
    query_2 = """
        SELECT *
        FROM your_table_2 a
        where to_date(timestamp) = current_date
        group by x
    """

    # EXECUTE QUERIES

    # EXE TOTAL
    c.execute(query_1)
    result_1 = c.fetchall()

    # EXE TOTAL PER REGION
    c.execute(query_2)
    result_2 = c.fetchall()
    fields = ['field_1', 'field_2', 'field_3']
    # Convert result tuples to dictionaries
    total_per_region = [dict(zip(fields, d)) for d in result_2]

    # RETURN VALUES TO TEMPLATE
    return render_template('index.html', result1=result_1, result2=result_2)
Example #23
 def __init__(self, host, port):
     try:
         self.conn = connect(host=host, port=port)
     except:
         error_msg = 'impala connect error: Cannot connect to server!'
         print(error_msg)
     self.cur = self.conn.cursor()
Example #24
def data2hive():
    # Read the config file
    conf = ConfigParser.ConfigParser()
    conf.read('config.ini')
    ip = conf.get("hive", "ip")
    tablename = conf.get("hive", "tablename")
    db = conf.get("hive", "db")

    # Create the database connection
    conn = connect(host=ip, port=10000, database=db, auth_mechanism='PLAIN')
    # Create a cursor
    cursor = conn.cursor()

    try:
        sql = "select distinct questno,txt_content_url from " + tablename
        cursor.execute(sql)
        result = cursor.fetchall()
        count = 0
        if result is None:
            print("hive dose not have data!")
            return
        for row in result:
            queue_data.put(row)
            count += 1
            print('count:' + str(count))
    except Exception as ex:
        print(ex)
    finally:
        cursor.close()
        conn.close()
Example #25
def search_info(request):
    if request.method == 'GET':
        return render(request,"TestModel/hello.html",{"hello":"pass"})
    elif request.method =='POST':
        try:
            # Read the request payload
            info=request.body
            data4info = json.loads(info)
            sql=doJoinSql(data4info)
            conn = connect(host='130.10.7.108', port=21050)
            cursor = conn.cursor()
            cursor.execute('USE rk')
            cursor.execute(sql)
            # Fetch the query results
            results = cursor.fetchall()
            print(results)
            ret={
                'msg': testmsg(results),
                'code': testcode(results),
                'data': results2json(data4info,results)
                }
            #return JsonResponse(ret,safe=False)
        except Exception as e:
            print("**************************************")
            print(e)
            print("**************************************")
            print("^上述异常发生了^")
        else:
            print("**************************************")
            print("一 切 OK")
            print("**************************************")
        finally:
            return JsonResponse(ret,safe=False)
    else:
        return "请求方法错误"
Example #26
def connect_impala():
    conn = connect(host='172.31.2.214',
                   port=25000,
                   user='******',
                   password='******')
    cur = conn.cursor(user='******')
    return cur
Example #27
    def __init__(self, host='10.18.0.19', port=21050):
        #logging.basicConfig(level=logging.INFO)
        logger.info(host)
        logger.info(port)
        self.conn = connect(host=host, port=port, database='default')

        self.cur = self.conn.cursor()
Example #28
    def _getting_data_from_hive(self):
        log_instance.info("hive-1")
        # getting data from hive
        s1 = "select distinct shop_id, t.shop_name from tmp.tmp_offline_dianping_shop_all_category t %s" % \
             bc_config.cateName2WhereCondition_Dict[self.cate_name]
        inc_data_file = bc_config.cateName2Dir_Dict[
            self.cate_name] + "/inc_data/inc_data.txt"
        conn = None
        cur = None
        try:
            conn = connect(host='172.20.207.6',
                           port=10000,
                           auth_mechanism="PLAIN")
            cur = conn.cursor()
            log_instance.info("hive-2")
            log_instance.info("hive-sql: %s" % s1)
            cur.execute(s1)
            lst1 = []
            results = cur.fetchall()
            log_instance.info("hive-3")
            for d in results:
                lst1.append("%s\t%s" % (d[0], d[1].replace("\t", " ")))

            with open(inc_data_file, "w") as f1:
                f1.write("\n".join(lst1))
                f1.flush()
            log_instance.info("hive-4")
        except Exception as e:
            log_instance.error(traceback.format_exc())
            raise e
        finally:
            if cur is not None: cur.close()
            if conn is not None: conn.close()

        return 0
Example #29
def run_remove_hive(conf: ConfigData, the_date: str, delta_day=0):
    f_date_str = StrTool.get_the_date_str(the_date, delta_day)  # "20181101"
    # "/user/hive/warehouse/rds_posflow.db/t1_trxrecprd_v2/t1_trxrecord_20181204_V2*.csv"

    del_table = conf.get_table_name()  # hive_table="rds_posflow.t1_trxrecprd_v2"

    if conf.m_project_id == 1:
        del_file = conf.get_file_name(f_date_str).replace('.', '*.')
        MyHdfsFile.delete_hive_ssh(conf.get_data("cdh_ip"),
                                   table=del_table,
                                   p_name=del_file,
                                   username=conf.get_data("cdh_user"),
                                   password=conf.get_data("cdh_pass"))

    if conf.m_project_id == 2:
        conn = connect(host=conf.hive_ip(),
                       port=conf.hive_port(),
                       auth_mechanism=conf.hive_auth(),
                       user=conf.hive_user())
        cur = conn.cursor()

        # "ALTER TABLE rds_posflow.t1_trxrecprd_v2_tmp DROP IF EXISTS PARTITION(p_date=20190208) "
        sql = "ALTER TABLE {} DROP IF EXISTS PARTITION( p_date={} )".format(
            del_table, the_date)
        print(sql)
        cur.execute(sql)

        cur.close()
        conn.close()
Example #30
def get_consumer_group_cheques(discount_card_id=_discount_card_id):
    conn = connect(host='node1.allende.bigkore.com', port=21050)
    cursor = conn.cursor()
    query = "select consumer_id, date1, group23, quantity from test.consumer_date_purchases_group23 where consumer_id='%s'" % (discount_card_id)
    print(query)
    cursor.execute(query)
    results = cursor.fetchall()
    results = [[str(r) for r in row] for row in results]
    conn.close()

    root_Node = id_name_Node('d=' + discount_card_id)
    date_node_dict = {}

    for row in results:
        date = row[1]
        date_node = None
        if date in date_node_dict:
            date_node = date_node_dict[date]
        else:
            date_node = id_name_Node('date=' + date)
            date_node_dict[date] = date_node
        good_node = id_name_Node('' + row[2])
        good_node.children = [
            id_name_Node('quantity=' + row[3]),
        ]
        date_node.children.append(good_node)

    root_Node.children += sorted(date_node_dict.values(), key=lambda n: n.name, reverse=True)
    return json.dumps(_to_json(root_Node))
Example #31
    def run_query(cls, host_name, table, query, op='select'):
        """Execute a impala query.
        Args:
            host_name: host name for impala connection
            table: database.table
            query: impala query
            op: indicator for select"""
        global IMPALA_CONN

        result = ''
        if not IMPALA_CONN:
            IMPALA_CONN = connect(host=host_name, port=25003, timeout=600,
                                  use_kerberos=True)
        cur = IMPALA_CONN.cursor()
        # Can't invalidate something that doesn't yet exist
        if op != "create":
            ImpalaConnect.invalidate_metadata(host_name, table)
        cur.execute(query, configuration={'request_pool': 'ingestion'})

        if cur:
            if op == 'select':
                result = cur.fetchall()
        else:
            raise "Hive connection - cursor is none"
        return result
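
A hypothetical call site for the classmethod above; the host and table names are illustrative:

rows = ImpalaConnect.run_query('impala-host.example.com', 'db.my_table',
                               'select count(*) from db.my_table')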
Example #32
def impala_query(sql):
    conn = connect(**impala_config)
    cur = conn.cursor()
    cur.execute(sql)
    df = as_pandas(cur)
    conn.close()
    return df
Example #33
 def conn(self):
     return connect(host='192.168.1.66',
                    port=10000,
                    user='******',
                    password='******',
                    database='default',
                    auth_mechanism='PLAIN')
Example #34
def create_connection():

    host, port = config.hive()
    conf = {}

    # TODO: when using Hive, the kerberos service name must be changed; impyla defaults to 'impala'
    conf.update({'kerberos_service_name': 'hive'})

    if config.kerberos_enabled():
        principal, keytab, sasl_mech, security_proto = config.kerberos()
        conf.update({
            'auth_mechanism': 'GSSAPI',
        })
    else:
        conf.update({
            'auth_mechanism': 'PLAIN',
        })

    if config.ssl_enabled():
        ssl_verify, ca_location, cert, key = config.ssl()
        if ssl_verify.lower() == 'false':
            conf.update({'use_ssl': ssl_verify})
        else:
            conf.update({'ca_cert': cert, 'use_ssl': ssl_verify})

    db = config.db()
    conn = connect(host=host, port=int(port), database=db, **conf)
    return conn.cursor()
Example #35
 def conn():
     return connect(host='master.mycdh.com',
                    port=21050,
                    database='movie_shop',
                    timeout=60,
                    user='******',
                    password='******')
Example #36
def update_raw_stage(output, delivery_tag):

    #context = zmq.Context()

    #confirm = context.socket(zmq.PUSH)
    #confirm.connect(confirm_host)

    hdfs = PyWebHdfsClient(host=webhdfs_host, port=webhdfs_port, user_name=webhdfs_user)
    impala_conn = connect(host=impala_host, port=int(impala_port))
    cur = impala_conn.cursor()

    start_time = time.time()

    for k, v in output.items():

        if (time.time() - start_time)/60 > sink_minutes:
            sink_logger.warning('ETL process running longer then sink timeout: {0} minutes'.format((time.time() - start_time)/60))
        try:
            file_name = 'user/impala/test_log_1/raw_log_{0}.txt'.format(k)
            hdfs.append_file(file_name, '\n'.join(v))
            cur.execute('refresh test_log_{0}'.format(k))

        except hdfs_err.PyWebHdfsException:
            file_name = 'user/impala/test_log_1/raw_log_{0}.txt'.format(k)
            hdfs.create_file(file_name, '')
            hdfs.append_file(file_name, '\n'.join(v))
            cur.execute('refresh test_log_{0}'.format(k))

    #confirm.send(delivery_tag)
    sink_logger.info('ETL process finished for {0} minutes'.format((time.time() - start_time)/60))
    sink_logger.info('ETL process finished with {0} delivery_tag'.format(delivery_tag))
Example #37
def run_sql(sql):
    conn = connect(host='172.17.69.25',
                   auth_mechanism='PLAIN',
                   port=21050,
                   user='******',
                   password='******')
    cursor = conn.cursor()
    cnt = 1

    # 2.2 Run the SQL query against Impala
    if ';' in sql:
        sql_list = sql.rstrip().split(';')
        # print(type(sql_list))
        if len(sql_list[-1]):
            for s in sql_list:
                print("runing sql @ %s" % cnt)
                cursor.execute(s)
                cnt += 1
        else:
            sql_list.pop()
            for s in sql_list:
                print("runing sql @ %s" % cnt)
                cursor.execute(s)
                cnt += 1
    else:
        print("runing sql @ %s" % cnt)
        cursor.execute(sql)
    # 2.3 Convert the result to a pandas DataFrame for analysis
    # df = as_pandas(cursor)
    # print(df)
    return as_pandas(cursor) if cursor.description is not None else 'null'
Example #38
def impala_connect(sql, **kwargs):
    # impala
    host = kwargs.get("host", 'impala.bjds.belle.lan')
    port = kwargs.get("port", 21051)
    timeout = kwargs.get("timeout", 3600)
    # hive
    # host = kwargs.get("host", 'impala.bjds.belle.lan')
    # port = kwargs.get("port", 10008)
    # timeout = kwargs.get("timeout", 3600)
    user = kwargs.get("user", "lv.d.sz")
    password = kwargs.get("password", 'JHjLXpyQ')
    kerberos_service_name = kwargs.get("kerberos_service_name", "impala")
    conn = connect(host=host,
                   port=port,
                   timeout=timeout,
                   user=user,
                   password=password,
                   kerberos_service_name=kerberos_service_name,
                   auth_mechanism='LDAP')
    cur = conn.cursor(user=user)
    if sql is None:
        return cur
    cur.execute(sql)
    try:
        df = as_pandas(cur)
    except Exception:
        return cur
    return df
Example #39
def batching_data():
    input_date = '2015-01-03'

    # Connection to database
    conn = connect(host='localhost',
                   port=10000,
                   auth_mechanism='PLAIN',
                   user='******',
                   password='******',
                   database='gdelt_db')
    c = conn.cursor()

    # Avg per country from an initial date
    query = """
        SELECT ActionGeo_CountryCode, AVG(AvgTone)
        FROM gdelt_temporal
        where to_date(SQLDATE) >= '{}'
        group by ActionGeo_CountryCode
    """

    # Execution of query
    c.execute(query.format(input_date))
    result_set = c.fetchall()

    # From list to dictionary
    header = ['country', 'avg_tone']
    avg_per_country = [dict(zip(header, row)) for row in result_set]

    return jsonify(avg_per_country)
Example #40
def run_hive(conf: ConfigData, the_date: str):
    client = Client(conf.hdfs_ip())  # "http://10.2.201.197:50070"
    conn = connect(host=conf.hive_ip(), port=conf.hive_port(), auth_mechanism=conf.hive_auth(), user=conf.hive_user())
    cur = conn.cursor()

    the_date = StrTool.get_the_date_str(the_date)  # "20181101"
    root_path = conf.get_data("hdfs_dir_zc")  # "/data/posflow/allinpay_utf8_zc/"
    file_ext3 = conf.get_data("file_ext3")  # _loginfo_rsp.txt          # 20181101_loginfo_rsp.txt
    file_ext4 = conf.get_data("file_ext4")  # _loginfo_rsp_agt.txt      # 20181101_loginfo_rsp_agt.txt
    file_ext5 = conf.get_data("file_ext5")  # _rxinfo_rsp.txt           # 20181101_rxinfo_rsp.txt
    file_ext6 = conf.get_data("file_ext6")  # _rxinfo_rsp_agt.txt       # 20181101_rxinfo_rsp_agt.txt

    print("Start\n")

    file3 = str(pathlib.PurePosixPath(root_path).joinpath(the_date + file_ext3))
    file4 = str(pathlib.PurePosixPath(root_path).joinpath(the_date + file_ext4))
    file5 = str(pathlib.PurePosixPath(root_path).joinpath(the_date + file_ext5))
    file6 = str(pathlib.PurePosixPath(root_path).joinpath(the_date + file_ext6))

    f_list = [file3,file4,file5,file6]
    t_list = ["hive_table3", "hive_table4", "hive_table5", "hive_table6"]

    for n in range(0,4):
        if MyHdfsFile.isfile(client, f_list[n]):
            sql = 'LOAD DATA INPATH \'' + f_list[n] + '\' INTO TABLE ' + conf.get_data(t_list[n])  # 'test.t1_trxrecprd_v2_zc'
            # '\' OVERWRITE INTO TABLE test.t1_trxrecprd_v2_bl2'
            print("OK" + "  " + sql+"\n")
            cur.execute(sql)  # , async=True)

    cur.close()
    conn.close()
Example #41
def hive_query(sql):
    conn = connect(**hive_config)
    cur = conn.cursor()
    cur.execute(sql)
    data = cur.fetchall()
    conn.close()
    return pd.DataFrame(data)
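
The frame above comes back with numeric column labels; a small variant (a sketch, same hive_config assumption) keeps the real names from the DB-API cursor.description metadata:

def hive_query_named(sql):
    conn = connect(**hive_config)
    cur = conn.cursor()
    cur.execute(sql)
    cols = [d[0] for d in cur.description]  # first field of each entry is the column name
    data = cur.fetchall()
    conn.close()
    return pd.DataFrame(data, columns=cols)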
Example #42
 def open_connection(self):
     try:
         self.connection = connect(host=self.host, port=self.port)
         logger.info('Connected to HiveServer2 at %s:%s' % (self.host, self.port))
     except TTransportException:
         logger.error('Unable to open connection to HiveSever2 at %s:%s' % (self.host, self.port))
         sys.exit(2)
     self.cursor = self.connection.cursor()
Example #43
def get_impala_connection():
    while True:
        try:
            conn = connect(host='al1.zmeke.com', port=21050)
            return conn
        except TTransportException as e:
            print(e)
            time.sleep(1)
Example #44
 def connect(self, host, port):
     '''
     :param host: Impala server hostname
     :param port: Impala server port
     :return: impyla cursor
     '''
      conn = connect(host=host, port=port)
     cursor = conn.cursor()
     return cursor
Example #45
def main():
    logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', filename='/etl/logs/install_active_log.log', level=logging.DEBUG)
    query_time = get_query_time()
    logging.info('Time for query: {0}'.format(query_time))

    cnx = sql.connect(user='******', password="******", host='localhost', port=3301, database='matracking')
    cursor = cnx.cursor()
    delete_query = 'delete from install_active_log where active_timestamp > {0}'.format(query_time)
    cursor.execute(delete_query)
    logging.info('Rows deleted')
    logging.info('New max time: {0}'.format(get_new_max_time()))

    conn = connect(host='h1.stats.zmeke.com', port=21050)
    cur = conn.cursor()
    cur.execute("select ios_ifa, google_aid, windows_aid, unix_timestamp(p_tdate) as active_timestamp, installid, site_id, publisher_id, campaign_id, created"
                " from "
                "(select distinct installid, case when ios_ifa is null or ios_ifa='' or ios_ifa='NF' then "
                " upper(google_aid) else upper(ios_ifa) end as id, ios_ifa, google_aid, windows_aid, site_id, publisher_id, campaign_id, created,"
                " case when ios_ifa is null or ios_ifa='' or ios_ifa='NF' then "
                " 'andr' else 'ios' end as type "
                " from analytics.installs where campaign_id<>'0') t1"
                " inner join "
                " (select distinct case when device_ids_idfa_am2 is null or device_ids_idfa_am2='' or device_ids_idfa_am2='NF' then "
                " upper(device_ids_google_aid_am2) else upper(device_ids_idfa_am2) end as id, p_tdate "
                " from r12.log_1 "
                " where unix_timestamp(p_tdate)>={0}) t2"
                " on t1.id=t2.id".format(query_time))

    impala_data = cur.fetchall()
    logging.info('Fetched rows from impala: {0}'.format(len(impala_data)))

    count = 0
    numb = 0
    add_row = ''

    for i in impala_data:

        if numb < 1000:
            count += 1
            numb += 1
            add_row += "INSERT INTO matracking.install_active_log (ios_ifa, google_aid, windows_aid, active_timestamp, installid, site_id, publisher_id, campaign_id, created) VALUES %s; " % str(i)
        else:
            for result in cursor.execute(add_row, multi=True):
                pass
            cnx.commit()
            count += 1
            numb = 1
            add_row = "INSERT INTO matracking.install_active_log (ios_ifa, google_aid, windows_aid, active_timestamp, installid, site_id, publisher_id, campaign_id, created) VALUES %s; " % str(i)

    for result in cursor.execute(add_row, multi=True):
        pass

    cnx.commit()
    cursor.close()
    cnx.close()
    logging.info('Rows inserted: {0}'.format(count))
Example #46
def con(host, port, auth_mech, tmp_db):
    # create the temporary database
    con = connect(host=host, port=port, auth_mechanism=auth_mech)
    cur = con.cursor()
    cur.execute("CREATE DATABASE {0}".format(tmp_db))
    cur.close()
    con.close()

    # create the actual fixture
    con = connect(host=host, port=port, auth_mechanism=auth_mech, database=tmp_db)
    yield con
    con.close()

    # cleanup the temporary database
    con = connect(host=host, port=port, auth_mechanism=auth_mech)
    cur = con.cursor()
    force_drop_database(cur, tmp_db)
    cur.close()
    con.close()
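
A hypothetical pytest test consuming the fixture above; the query is illustrative:

def test_show_tables(con):
    # 'con' is injected by the fixture, already scoped to the temporary database.
    cur = con.cursor()
    cur.execute("SHOW TABLES")
    assert cur.fetchall() == []  # the temporary database starts empty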
Example #47
File: ConEct.py Project: P1R/RHPy
    def db_connect(self):
        '''Connects to Hive using the settings in config.py and returns
        the connection and cursor.'''

        conn = connect(host=DbD['host'],
                port=DbD['port'],
                user=DbD['user'],
                password=DbD['password'])
        cursor = conn.cursor()
        return conn, cursor
Example #48
 def _execute_actions(self):
     with dbapi.connect(host=self.server,
                        port=self.port,
                        user=self.user,
                        database=self.database,
                        auth_mechanism="PLAIN") as conn:
         cursor = conn.cursor()
         for action in self.hql_actions:
             logger.debug("Executing hql action:\n{0}\n".format(action))
             cursor.execute(action)
Example #49
 def run(self):
     cnx = connect(host=self.host, port=self.port)
     cursor = cnx.cursor()
     try:
         while True:
             work_item = self.q.get(block=False)
             self.do_work(cursor, work_item)
     except Empty:
         pass
     finally:
         cnx.close()
Example #50
 def get_impala_connection(self):
     while True:
         try:
             impala_conn = connect(host=impala_host, port=int(impala_port))
             cur_impala = impala_conn.cursor()
             cur_impala.execute("refresh default.fail_test;")
             return impala_conn
         except TTransportException as e:
             logging.exception("Failed to connect to Impala")
             logging.exception(e.message)
             time.sleep(10)
Example #51
 def get_impala_connection(self):
     while True:
         try:
             impala_conn = connect(host=self.impala_host, port=int(self.impala_port))
             cur_impala = impala_conn.cursor()
             cur_impala.execute('refresh default.fail_test;')
             return impala_conn
         except TTransportException as e:
             self.sink_logger.exception("""Failed to connect to Impala""")
             self.sink_error_logger.exception("""Failed to connect to Impala with error {error}""".format(error=e.message))
             time.sleep(10)
Example #52
 def get_impala_connection(self):
     while True:
         try:
             impala_conn = connect(host=impala_host, port=int(impala_port))
             cur_impala = impala_conn.cursor()
             cur_impala.execute('refresh default.fail_test;')
             return impala_conn
         except TTransportException as e:
             sink_logger.exception('Failed to connect to Impala')
             sink_error_logger.exception(e.message)
             time.sleep(10)
Example #53
    def get(self):
        conn = connect(host=host_machine_ip_address, port=21050)
        cur = conn.cursor()
        command = self.get_argument('query')
        if "drop" in command.lower():
            return
        cur.execute(command.encode('ascii', 'ignore').decode().replace(";", ""))
        data = cur.fetchall()
        print(data)
        code = html.prettyDatabase(data)
        json_data = {"data": data, "code": code, "query": command}
        self.write(json_data)
Example #54
    def get_impala_connection(self):
        try:
            impala_conn = connect(host=self.impala_host, port=int(self.impala_port), timeout=30)
            cur_impala = impala_conn.cursor()
            cur_impala.execute('refresh default.fail_test;')

            self.sink_logger.info('Connected to Impala node {0}'.format(self.impala_host))
            return True, impala_conn, cur_impala
        except TTransportException as e:
            self.sink_logger.exception("""Failed to connect to Impala: {0}""".format(self.impala_host))
            self.sink_error_logger.exception("""Failed to connect to Impala with error {error}""".format(error=e.message))
            try:
                impala_conn = connect(host=self.impala_host_reserve, port=int(self.impala_port), timeout=30)
                cur_impala = impala_conn.cursor()
                cur_impala.execute('refresh default.fail_test;')
                self.sink_logger.info('Connected to Impala node {0}'.format(self.impala_host_reserve))
                return True, impala_conn, cur_impala
            except TTransportException as e:
                self.sink_logger.exception("""Failed to connect to Impala: {0}""".format(self.impala_host_reserve))
                self.sink_error_logger.exception("""Failed to connect to Impala with error {error}""".format(error=e.message))
                return False, False, False
Example #55
def desc_total_sales_volumn(year):
    
    # Redis read cache value
    REDIS_KEY = "desc_total_sales_vol:{0}".format(year)
    cached_data = redis_io.read_transaction(REDIS_KEY)
    
    if cached_data is not None:
        return ast.literal_eval(cached_data)
    #
    
    conn = connect(host='salest-master-server', port=21050)
    cur = conn.cursor()

    # daily transaction agg
    cur.execute('USE salest')
    
    bIsRealTimeUpdated = REFRESH_IMPALA_TABLE(cur, year)
        
    cur.execute("""
        SELECT year_month_day, SUM(num_of_product) AS num_of_product, SUM(sales_amount) AS total_amount
        FROM (
            SELECT SUBSTR(date_receipt_num,1,10) AS year_month_day, num_of_product, sales_amount
            FROM """ + GET_IMPALA_TB_NAME(year) + """ WHERE SUBSTR(date_receipt_num,1,4) = '""" + year +
        """'
        ) view_tr_recipt
        GROUP BY year_month_day ORDER BY year_month_day ASC
        """
    )

    df_tr_agg_daily = as_pandas(cur)
    conn.close()
    
    series_sum = df_tr_agg_daily[['num_of_product','total_amount']].sum()
    series_sum.name = 'sum'

    df_desc = df_tr_agg_daily.describe().append(series_sum)
    df_desc['num_of_product'] = df_desc['num_of_product'].apply(lambda v: round(v))
    df_desc['total_amount'] = df_desc['total_amount'].apply(lambda v: round(v))
    
    df_desc.fillna(0, inplace=True)
    
    cached_data = df_desc.to_dict()
    
    if not bIsRealTimeUpdated:
        # Redis save cache value
        redis_io.write_transaction(REDIS_KEY, cached_data)
        #
        cached_data = redis_io.read_transaction(REDIS_KEY)
        cached_data = ast.literal_eval(cached_data)
    
    return cached_data
Example #56
    def _new_cursor(self):
        params = self.params.copy()
        con = impyla_dbapi.connect(database=self.database, **params)

        # make sure the connection works
        cursor = con.cursor()
        cursor.ping()

        wrapper = ImpalaCursor(cursor, self, self.database)

        if self.codegen_disabled:
            wrapper.disable_codegen(self.codegen_disabled)

        return wrapper
Example #57
def get_timebase_data_on_past_specific_date(cur_date):
    
    # Redis read cache value
    REDIS_KEY_PREFIX = "past_timebase_data_of"
    
    def get_cache_value(cur_date):
        return redis_io.read_dict_transaction(REDIS_KEY_PREFIX + ":" + cur_date) 
    
    cached_data = get_cache_value(cur_date)
    if cached_data is not None:
        return cached_data
    
    
    conn = connect(host='salest-master-server', port=21050)
    cur = conn.cursor()

    cur.execute('USE salest')
    
    date_list = tuple(get_past_target_date(cur_date))

    cur.execute(
    """
        SELECT time_hour, CAST(SUM(sales_amount) as INTEGER) AS total_amount, 
        COUNT(sales_amount) as num_of_transaction,
        COUNT(DISTINCT year_month_day) as date_count
        FROM(
            SELECT SUBSTR(date_receipt_num,1,10) AS year_month_day,
            SUBSTR(tr_time,1,2) AS time_hour,
            sales_amount
            FROM ext_tr_receipt WHERE SUBSTR(date_receipt_num,1,10) IN ('%s')
            """ % "','".join(date_list) +
            """
        ) view_tr_total_amount_by_dayofweek
        GROUP BY time_hour ORDER BY time_hour ASC
        """
    )
    df_by_hour = as_pandas(cur)
    conn.close()
    
    df_by_hour.set_index('time_hour',inplace=True)
    df_by_hour = df_by_hour.reindex([str(i) for i in np.arange(10, 24)], fill_value=0)

    dict_result = df_by_hour['total_amount'].to_dict()
    dict_result['date'] = date_list[0]
     
    redis_io.write_dict_transaction(REDIS_KEY_PREFIX + ":" + cur_date, dict_result, 60*60)
    
    ret_dict = get_cache_value(cur_date)

    return ret_dict
Example #58
 def run_testing(self):
     conn = connect(host='172.168.0.24', port=21050,database = 'tpcds_text_150')
     cursor = conn.cursor()
     count = 0
     cur_task = 0
     query_time = []
     with open(os.path.join(self.profile_path,'RESULT'),'w') as fp_w:
         for query in self.query_list:
             if query not in self.des_dic.keys():
                  self.des_dic[query] = {}
             for cpufreq in self.cpufreq_range:
                 if cpufreq not in self.des_dic[query].keys():
                     self.CPUFreqSet_OBJ.set(cpufreq)
                     self.des_dic[query][cpufreq] = []
                 start_time = time.time()
                 #for i in range(self.every_query_times):
                 
                 with open(os.path.join(local_config()['query_dir'],query),'r') as fp:
                      sql= fp.read().strip('profile;\n')
                      sql = sql.strip('; ')
                 while True:
                     query_start_time = time.time()
                     cursor.execute('%s'%(sql))
                     end_time = time.time()
                     query_time.append(end_time - query_start_time)
                     while True:
                         row=cursor.fetchone()
                         if row:
                             pass
                         else:
                             break
                     
                     cur_profile = cursor.get_profile()
                     
                     count = count + 1
                     
                     with open(os.path.join(self.profile_path,str(count)+'.log'),'w') as fp_profile:
                         fp_profile.write("%s"%(cur_profile))
                     
                     if end_time - start_time > local_config()['duration_time'] :
                         break
                     
                 self.des_dic[query][cpufreq].sort()
         print "%s,%s,%s"%(self.users_InTotal,self.user_name,count)
         #print >>self.logging,"%s,%s,%s"%(self.profile_path,self.user_name,count)
         fp_w.write("%s,%s,%s,%s"%(self.users_InTotal,self.user_name,count, query_time))
     cursor.close()
     conn.close()
     return self.des_dic
Example #59
 def __init__(self, temp_dir=None, temp_db=None, nn_host=None,
         webhdfs_port=50070, hdfs_user=None, *args, **kwargs):
     # args and kwargs get passed directly into impala.dbapi.connect()
     suffix = _random_id(length=8)
     self._temp_dir = '/tmp/impyla-%s' % suffix if temp_dir is None else temp_dir
     self._temp_db = 'tmp_impyla_%s' % suffix if temp_db is None else temp_db
     self._conn = connect(*args, **kwargs)
     self._cursor = self._conn.cursor()
     # used for pywebhdfs cleanup of temp dir; not required
     self._nn_host = nn_host
     self._webhdfs_port = webhdfs_port
     self._hdfs_user = hdfs_user
     if temp_db is None:
         self._cursor.execute("CREATE DATABASE %s LOCATION '%s'" %
                 (self._temp_db, self._temp_dir))
Example #60
 def connect(self):
   LOG.info("-- connecting to {0} with impyla".format(self.__host_port))
   host, port = self.__host_port.split(":")
   self.__impyla_conn = impyla.connect(host=host, port=int(port))
   LOG.info("Conn {0}".format(self.__impyla_conn))
   # Get the default query options for the session before any modifications are made.
   self.__cursor = self.__impyla_conn.cursor()
   self.__cursor.execute("set all")
   self.__default_query_options = {}
   for name, val, _ in self.__cursor:
     self.__default_query_options[name] = val
   self.__cursor.close_operation()
   LOG.debug("Default query options: {0}".format(self.__default_query_options))