Example No. 1
import jpype

# Start the JVM with the HiveUtil jar on the classpath
JVM_path = jpype.getDefaultJVMPath()
jpype.startJVM(
    JVM_path, "-ea",
    "-Djava.class.path=C:\\Users\\CBH\\IdeaProjects\\HiveUtil\\target\\HiveUtil-1.0-SNAPSHOT.jar"
)

# Print hello world
jpype.java.lang.System.out.println("hello World")

TA = jpype.JPackage('com.sbh.tj').ConnectorHiveUtil

TA = TA()
a = TA.GetHiveConnctor('jdbc:hive2://192.168.8.10:10000/default',
                       'show databases')
a = TA.newGetCon()
# Shut down the JVM
jpype.shutdownJVM()
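
If the Java call raises, the shutdownJVM() above is never reached. A minimal sketch of the same flow with a try/finally guard (same jar path and class names as above; isJVMStarted() and getDefaultJVMPath() are standard JPype calls):

import jpype

if not jpype.isJVMStarted():
    jpype.startJVM(
        jpype.getDefaultJVMPath(), "-ea",
        "-Djava.class.path=C:\\Users\\CBH\\IdeaProjects\\HiveUtil\\target\\HiveUtil-1.0-SNAPSHOT.jar")
try:
    util = jpype.JPackage('com.sbh.tj').ConnectorHiveUtil()
    util.GetHiveConnctor('jdbc:hive2://192.168.8.10:10000/default', 'show databases')
finally:
    # Always tear the JVM down, even if the Java call fails
    jpype.shutdownJVM()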

# Connect to Hive with the third-party pyhive package

from pyhive import hive
import pandas as pd

conn = hive.Connection(host="192.168.8.10",
                       port=10000,
                       username='******',
                       password='******',
                       database='default',
                       auth="LDAP")
cursor = conn.cursor()
cursor.execute("select * from xxx")
result = cursor.fetchall()
df = pd.DataFrame(list(result))
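
fetchall() returns bare tuples, so the DataFrame above ends up with numeric column labels. A small sketch that recovers the real names from the DB-API cursor.description attribute:

# cursor.description holds one (name, type, ...) tuple per result column
cols = [col[0] for col in cursor.description]
df = pd.DataFrame(result, columns=cols)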
Example No. 2
 def __getConnection(self):
     conn = hive.Connection(host=self.host_name, port=self.port, database=self.database, auth='NOSASL')
     return conn
Example No. 3
from pyhive import hive
import pandas as pd

# Create Hive connection
# "CRUD" is a database created in Hive for this example
conn = hive.Connection(host="localhost",
                       port=10000,
                       username="******",
                       database="CRUD",
                       auth="NOSASL")
# Join operation; pd.read_sql() executes the query itself, so a separate
# cursor.execute() call is not needed (it would run the join twice)
join_operation = """SELECT address, firstTable.name, firstTable.age FROM secondTable JOIN firstTable ON (secondTable.id = firstTable.id)"""
data = pd.read_sql(join_operation, conn)
print(data)
Example No. 4
from pyhive import hive
import pandas as pd
import sys
# Establish a connection to the Hive server and database

conn = hive.Connection(host="localhost", port=10000, database="logs", auth="NOSASL")

try:
    # Load the cpulogs table into a pandas DataFrame
    query = pd.read_sql('select * from cpulogs', conn)

    # Rows whose start time is after 9:30 AM
    late_comers = query[query['cpulogs.start_time'] > '2019-10-24 09:30:00']

    # User names only
    late_comers_usernames = late_comers['cpulogs.user_name']

    print(late_comers_usernames)
except Exception as e:
    # A bare except printing "Syntax error" would hide the real failure
    print("Query failed:", e)
Example No. 5
directory = 'BL/'

from os import walk
import pandas as pd

categories = pd.DataFrame()
for (dirpath, dirnames, filenames) in walk(directory):
    if len(dirnames) == 0:  # leaf directory: its relative path is the category
        category = dirpath[len(directory):]
        print(category)

        filepath = dirpath + "/" + "domains"
        with open(filepath) as f:
            content = f.readlines()
        content = [x.strip() for x in content]
        content = pd.DataFrame(content)
        content['category'] = category
        content.columns = ['domain', 'category']
        categories = pd.concat([categories, content])

        #print(content)

categories

from pyhive import hive
conn = hive.Connection(host="YOUR_HIVE_HOST", port="PORT", username="******")

import pandas as pd
df = pd.read_sql("SELECT cool_stuff FROM hive_table", conn)
Example No. 6
File: sql.py Project: mnms/share
 def _execute(self, text):
     ip, port = sql_util.get_thrift_addr()
     conn = hive.Connection(host=ip, port=port)
     cursor = conn.cursor(arraysize=1000)
     # PyHive's cursor.execute() returns None, so callers should read rows
     # from the returned cursor rather than from `result`
     result = cursor.execute(text)
     return result, cursor
Example No. 7
from pyhive import hive
import pandas
import sys
import ssl
import thrift
import thrift_sasl
import json
from datetime import datetime as dt
import numpy as np
import matplotlib.pyplot as plt

# Instantiate a connection to Hive via HiveServer2, using the PyHive library,
# and use it to read a Hive table into a pandas DataFrame

connection = hive.Connection(host="localhost",
                             port=10000,
                             username='******')
dataframe = pandas.read_sql("SELECT * FROM igdbtables.topgames", connection)

# Convert dataframe rows to plain Python values for processing.
# (The original dataframe.to_dict() call returned a new dict that was
# discarded, so it is dropped here.)

games = []
genreDataSet = []
limit = min(500, len(dataframe))  # don't index past the end of the frame

for i in range(0, limit):
    game = {}
    game['id'] = int(dataframe['topgames.id'][i])
    game['name'] = dataframe['topgames.name'][i]
    theseGenres = dataframe['topgames.genres'][i]
Example No. 8
from pyhive import hive

"""
使用kerberos认证,连接程序需要安装客户端
"""

conn = hive.Connection(host='master1-de.pagod.com.cn',
                       port=10000,
                       auth="KERBEROS",
                       kerberos_service_name="hive",
                       username='******',
                       database='default')
cursor = conn.cursor()
cursor.execute('show tables')

for result in cursor.fetchall():
    print(result)
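
This assumes a valid Kerberos ticket is already in the client's credential cache (e.g. obtained with kinit). A minimal sketch of the same query with explicit cursor cleanup:

from contextlib import closing

with closing(conn.cursor()) as cur:
    cur.execute('show tables')
    for row in cur.fetchall():
        print(row)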
Example No. 9
from pyhive import hive
import pandas as pd
import sys
# Establish a connection to the Hive server and database
conn = hive.Connection(host="localhost",
                       port=10000,
                       username="******",
                       database="logs",
                       auth="NOSASL")

try:
    # Load the workinglogs table into a pandas DataFrame
    query = pd.read_sql('select * from workinglogs', conn)
    # Convert the working_hours column to datetime
    query['workinglogs.working_hours'] = pd.to_datetime(
        query['workinglogs.working_hours'])
    # Keep the rows below the mean of working_hours
    lowest_avghour_log = query[query['workinglogs.working_hours'] <
                               query['workinglogs.working_hours'].mean()]
    lowest_avghour_log.to_csv("data/lowest_avghours_user_log.csv", index=False)
    # print the user_name with the lowest average hours
    # LOWEST_AVG_HOURS = avghour['workinglogs.user_name']
    # print(LOWEST_AVG_HOURS)
except Exception as e:
    print("Query failed:", e)
"""


def date_tuple(date_string):
    # despite the name, this returns a datetime.date, not a tuple
    lst = date_string.split('-')
    return datetime.date(int(lst[0]), int(lst[1]), int(lst[2]))


def gen_maturity_date(matu):
    regex = re.compile(r'\.|、')
    splt = regex.split(matu)
    return datetime.date(datetime.datetime.now().year, int(splt[0]),
                         int(splt[1]))


conn = hive.Connection(host="202.120.38.90", port=10086, auth="NOSASL")
cursor = conn.cursor()
# cursor.execute("select DISTINCT dated_date from bonds")
# for result in cursor.fetchall():
#     print(result)
cursor.execute(
    "select denomination,issue_start_date,dated_date,expiration_date,repayment_method,APR,bond_id from bonds"
)
total = 0
f = open("tmp.txt", mode='w', encoding='utf-8')
l = []
for result in cursor.fetchall():
    s = ""
    for item in result:
        s += str(item) + "\t"
    l.append(s + "\n")
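
The snippet is cut off before tmp.txt is actually written; a minimal completion consistent with the code above (an assumption, since the original tail is missing):

f.writelines(l)
f.close()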
Example No. 11
from pyhive import hive
from TCLIService.ttypes import TOperationState
import pandas as pd

cursor = hive.connect(host='localhost', port=10000, username='******').cursor()
cursor.execute("CREATE EXTERNAL TABLE IF NOT EXISTS cc(c_code int,country string)ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'")
cursor.execute("LOAD DATA LOCAL INPATH '/home/student/Desktop/CCEE_Final_Project/Dataset/country_code.csv' OVERWRITE INTO TABLE cc")
#cursor.execute("CREATE EXTERNAL TABLE IF NOT EXISTS zomato_project_3(Restaurant_ID string,Restaurant_Name string,Country_Code INT,City string,Address string,Locality string,Locality_Verbose string,Longitude string,Latitude string,Cuisines string,Average_Cost_for_two string,Currency string,Has_Table_booking string,Has_Online_delivery string,Is_delivering_now string,Switch_to_order_menu string,Price_range int,Aggregate_rating string,Rating_color string,Rating_text string,Votes string)ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'")
#cursor.execute("LOAD DATA LOCAL INPATH '/home/student/Desktop/CCEE_Final_Project/Dataset/Zomato.csv' OVERWRITE INTO TABLE zomato_project_3")


# In[51]:


import pandas as pd
conn = hive.Connection(host='localhost', port=10000, username='******')
df = pd.read_sql("select * from zomato_project_3", conn)


# In[52]:


df.head()


# In[53]:


df.rename(index=str, columns={"zomato_project_3.restaurant_id":"Restaurant ID",
                              "zomato_project_3.restaurant_name":"Restaurant Name",
                              "zomato_project_3.country_code":"Country Code",
Example No. 12
                    datetime.timedelta(1)).strftime('%Y%m%d')

    # Read connection info from config file
    config = configparser.ConfigParser()
    config.read('connection.cfg')

    hive_conn = config['hive']
    hive_host = hive_conn['host']
    hive_port = int(hive_conn['port'])
    hive_user = hive_conn['user']

    path_cfg = config['path']
    output_path = path_cfg['output']

    # Setup hive connection and execute query
    conn = hive.Connection(host=hive_host, port=hive_port, username=hive_user)
    cursor = conn.cursor()
    cursor.execute("use " + dbName)
    cursor.execute("show tables", async=True)

    status = cursor.poll().operationState
    while status in (TOperationState.INITIALIZED_STATE,
                     TOperationState.RUNNING_STATE):
        logs = cursor.fetch_logs()
        for message in logs:
            print(message)

        # If needed, an asynchronous query can be cancelled at any time with:
        # cursor.cancel()

        status = cursor.poll().operationState
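
Once the poll loop exits, the operation has reached a terminal state and rows can be fetched as usual; a short sketch:

for row in cursor.fetchall():
    print(row)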
Example No. 13
def run(cfg):
    conn = hive.Connection(host='10.213.37.46',
                           username='******',
                           password='******',
                           auth='CUSTOM')
    cursor = conn.cursor()
    cursor.execute('select * from dlpm_11092020_model_stat')
    stat_model = cursor.fetchone()
    model_info = json.loads(stat_model[0])
    stat_info = json.loads(stat_model[1])

    names = []
    tfrecord_location = cfg['tfrecords_local_path']
    for file in os.listdir(tfrecord_location):
        if file.startswith("part"):
            names.append(file)
    file_paths = [os.path.join(tfrecord_location, name) for name in names]

    # read and make the dataset from tfrecord
    dataset = tf.data.TFRecordDataset(file_paths)
    dataset = dataset.map(__data_parser)

    batch_size = cfg['batch_size']
    duration = cfg['duration']

    dataset = dataset.batch(batch_size).shuffle(SHUFFLE_BUFFER)
    iterator = dataset.make_one_shot_iterator()
    next_el = iterator.get_next()

    # lagged_ix = numpy.ones((duration, 4), dtype=float)
    # lagged_ix = np.where(lagged_ix == 1, -1, lagged_ix)
    lagged_ix = np.stack(lag_indexes(model_info), axis=-1)
    # quarter_autocorr = numpy.ones((batch_size,), dtype=float)

    date_list = model_info['days']
    dow = get_dow(date_list)

    holiday_list = cfg['holidays']

    holidays = [1 if _ in holiday_list else 0 for _ in date_list]
    a_list = []
    b_list = []
    for _ in holidays:
        a, b = holiday_norm(_)
        a_list.append(a)
        b_list.append(b)
    holiday = (a_list, b_list)

    with tf.Session() as sess:

        x = sess.run(next_el)
        quarter_autocorr = numpy.ones((x[0].size, ), dtype=float)
        page_indx = list(x[0])

        fill_isolated_zeros(x[21])
        tensors = dict(
            hits=pd.DataFrame(x[21], index=page_indx, columns=date_list),
            lagged_ix=lagged_ix,
            page_ix=page_indx,
            pf_age=pd.DataFrame(x[8:15],
                                columns=page_indx,
                                index=(1, 2, 3, 4, 5, 6, 7)).T,
            pf_si=pd.DataFrame(x[20], index=page_indx),
            pf_network=pd.DataFrame(x[15:20],
                                    columns=page_indx,
                                    index=('2G', '3G', '4G', 'UNKNOWN',
                                           'WIFI')).T,
            pf_price_cat=pd.DataFrame(x[1:4],
                                      columns=page_indx,
                                      index=('pc1', 'pc2', 'pc3')).T,
            pf_gender=pd.DataFrame(x[4:8],
                                   columns=page_indx,
                                   index=('none', 'f', 'm', 'x')).T,
            page_popularity=x[22],
            # page_popularity = quarter_autocorr,
            quarter_autocorr=quarter_autocorr,
            dow=pd.DataFrame(dow).T,
            holiday=pd.DataFrame(holiday).T)

        data_len = tensors['hits'].shape[1]
        plain = dict(data_days=data_len - cfg['add_days'],
                     features_days=data_len,
                     data_start=date_list[0],
                     data_end=date_list[-1],
                     features_end=date_list[-1],
                     n_pages=batch_size)
        VarFeeder(cfg['data_dir'], tensors, plain)
Example No. 14
#!/bin/env python
# -*- coding: UTF-8 -*-


from pyhive import hive

conn = hive.Connection(host='192.168.137.130', port=10000, username='******', database='default')
cursor = conn.cursor()
sql =   "select sum(Proc_Rec_Total_Qty),sum(All_Dupl_Rec_Qty),sum(Pk_Dupl_Rec_Qty) from ( "\
        "select count(1) as Proc_Rec_Total_Qty ,0 as All_Dupl_Rec_Qty,0 as Pk_Dupl_Rec_Qty from cls_db_3.lwq_test_tb where data_dt='2019-05-03' and tags is not null  "\
        "union "\
        "select 0 as Proc_Rec_Total_Qty ,count(1) as All_Dupl_Rec_Qty,0 as Pk_Dupl_Rec_Qty from isu_db_3.lwq_test_tb where data_dt='2019-05-03' and isu_type='1' "\
        "union "\
        "select 0 as Proc_Rec_Total_Qty,0 as All_Dupl_Rec_Qty,count(1) as Pk_Dupl_Rec_Qty from isu_db_3.lwq_test_tb where data_dt='2019-05-03' and isu_type='3' "\
        ") t"
cursor.execute(sql)
result = cursor.fetchone()
if result is not None:
    print(result)
    Proc_Rec_Total_Qty = result[0]
    All_Dupl_Rec_Qty = result[1]
    Pk_Dupl_Rec_Qty = result[2]


Example No. 15
from pyhive import hive

conn = hive.Connection(host='localhost',
                       port=10002,
                       username='******',
                       database='test')
cursor = conn.cursor()
sql = "select dst12 from traffic_matrices where srcid='12' limit 10;"
cursor.execute(sql)
str = " "
for result in cursor.fetchall():
    print(str.join(result))
Example No. 16
from pyhive import hive

conn = hive.Connection(host='192.168.109.172',
                       port=10000,
                       auth='NOSASL',
                       username='******',
                       database='default')
cursor = conn.cursor()
cursor.execute('select * from link_expenses limit 1')
for result in cursor.fetchall():
    print(result)
conn.close()
Example No. 17
from pyhive import hive
from flask import Flask, jsonify, request
from flask_restful import Resource, Api

#=======================================================================================================================
#Establish connection with hive using Pyhive
#=======================================================================================================================

app = Flask(__name__)
db = hive.Connection(host="<hive host>", port=<default port>, database="default")
cursor = db.cursor()

#=======================================================================================================================
#Product Rest Endpoints Definition
#=======================================================================================================================

@app.route('/products', methods=['GET'])
def products():
    if 'benefit' in request.args:
        getbenefit = request.args['benefit']
        splitbenefit = getbenefit.split(",")
        sql_query = '''select * from dev.products_sample where benefit in ('%s')''' % ("','".join(splitbenefit))
        cursor.execute(sql_query)
        content = [dict((cursor.description[i][0], value)
                        for i, value in enumerate(row)) for row in cursor.fetchall()]
        return jsonify({'myCollection': content})

    elif 'id' in request.args:
        getid = request.args['id']
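
Separately, the 'benefit' branch above builds its IN clause by string-joining raw request input, which is SQL-injectable. A hedged rewrite using DB-API parameter binding (PyHive escapes sequence parameters substituted into %s placeholders):

placeholders = ",".join(["%s"] * len(splitbenefit))
sql_query = "select * from dev.products_sample where benefit in (%s)" % placeholders
cursor.execute(sql_query, tuple(splitbenefit))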
Example No. 18
from pyhive import hive


def connect_to_hive(hive_user):
    hive_host = '10.11.12.144'
    hive_conf = {'job.queue.name': 'default'}
    return hive.Connection(host=hive_host,
                           username=hive_user,
                           configuration=hive_conf)
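
A hypothetical usage of the helper above (the user name is an assumption):

cur = connect_to_hive('etl_user').cursor()
cur.execute('show tables')
print(cur.fetchall())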
Example No. 19
from pyhive import hive

# Note: 50070 is the HDFS NameNode web UI port; HiveServer2 usually listens on 10000
conn = hive.Connection(host="localhost", port=50070, username="******")
Example No. 20
# ## load data (from Hive)
from pyhive import hive
import pandas as pd

conn = hive.Connection(host='mlamairesse-training-1.vpc.cloudera.com',
                       port=10000,
                       auth='KERBEROS',
                       kerberos_service_name='hive')
airlines_pd_df = pd.read_sql('select * from flights.airports', conn)
airlines_pd_df.set_index('airports.iata', inplace=True)

# ## lookup function


def lookup(arg: dict):
    code = arg['code'].upper()
    return airlines_pd_df.loc[code, :].to_dict()


lookup({"code": "WRL"})