def make_connection():
    print(ENV)
    ic = ibis.impala_connect(host=ENV.impala_host,
                             port=ENV.impala_port,
                             protocol=ENV.impala_protocol)
    hdfs = ibis.hdfs_connect(host=ENV.nn_host, port=ENV.webhdfs_port)
    client = ibis.make_client(ic, hdfs_client=hdfs)
    return client
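This snippet uses the old ibis 0.x API (`ibis.impala_connect` plus `ibis.make_client`). Later snippets in this collection use the `ibis.impala.connect` entry point that replaced it; a minimal sketch of the equivalent call against the same `ENV`, for comparison:

def make_connection():
    # same connection, expressed with the newer single-call API
    hdfs = ibis.hdfs_connect(host=ENV.nn_host, port=ENV.webhdfs_port)
    return ibis.impala.connect(host=ENV.impala_host,
                               port=ENV.impala_port,
                               hdfs_client=hdfs)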
def test_create_table_with_location(self):
    base = pjoin(self.tmp_dir, util.guid())
    name = 'test_{0}'.format(util.guid())
    tmp_path = pjoin(base, name)

    # impala user has trouble writing to jenkins-owned dir so here we give
    # the tmp dir 777
    superuser_hdfs = ibis.hdfs_connect(
        host=ENV.nn_host,
        port=ENV.webhdfs_port,
        auth_mechanism=ENV.auth_mechanism,
        verify=(ENV.auth_mechanism not in ['GSSAPI', 'LDAP']),
        user=ENV.hdfs_superuser)
    superuser_hdfs.mkdir(base)
    superuser_hdfs.chmod(base, '777')

    expr = self.alltypes
    table_name = _random_table_name()

    self.con.create_table(table_name, expr=expr, path=tmp_path,
                          database=self.test_data_db)
    self.temp_tables.append('.'.join([self.test_data_db, table_name]))
    assert self.hdfs.exists(tmp_path)
def get_database_sqlalchemy_conn(database_type, host, port, database,
                                 user=None, password=None, **kwargs):
    if database_type == 'greenplum':
        conn = sqlalchemy.create_engine(
            'postgresql+psycopg2://{}:{}@{}:{}/{}'.format(
                user, password, host, port, database))
    elif database_type == 'mysql':
        conn = sqlalchemy.create_engine(
            'mysql+pymysql://{}:{}@{}:{}/{}'.format(
                user, password, host, port, database))
    elif database_type == 'hive':
        hdfs_host = kwargs.get('hdfs_host', host)
        hdfs_port = kwargs.get('hdfs_port', 50070)
        auth_mechanism = kwargs.get('hive_auth_mechanism', 'PLAIN')
        hdfs_client = ibis.hdfs_connect(host=hdfs_host, port=hdfs_port)
        conn = connect(host, port, auth_mechanism=auth_mechanism,
                       database=database, hdfs_client=hdfs_client,
                       user=user, password=password)
    else:
        return None
    return conn
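A quick illustration of calling the dispatcher; the hostname and credentials below are placeholders, not values from any of these snippets:

# hypothetical call: build a MySQL engine on the default port
engine = get_database_sqlalchemy_conn('mysql', 'db.example.com', 3306,
                                      'analytics', user='reader',
                                      password='secret')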
def hdfs_client(env):
    return ibis.hdfs_connect(
        host=env.nn_host,
        port=int(env.webhdfs_port),
        auth_mechanism=env.auth_mechanism,
        user=env.webhdfs_user,
    )
def make_ibis_client():
    hc = ibis.hdfs_connect(host=ENV.nn_host, port=ENV.webhdfs_port,
                           auth_mechanism=ENV.auth_mechanism,
                           verify=(ENV.auth_mechanism not in
                                   ['GSSAPI', 'LDAP']))
    if ENV.auth_mechanism in ['GSSAPI', 'LDAP']:
        print("Warning: ignoring invalid Certificate Authority errors")
    return ibis.impala.connect(host=ENV.impala_host, port=ENV.impala_port,
                               auth_mechanism=ENV.auth_mechanism,
                               hdfs_client=hc)
def setUpClass(cls):
    cls.ENV = ENV
    cls.tmp_dir = pjoin(cls.ENV.tmp_dir, util.guid())
    if cls.ENV.use_kerberos:
        print("Warning: ignoring invalid Certificate Authority errors")
    cls.hdfs = ibis.hdfs_connect(host=cls.ENV.nn_host,
                                 port=cls.ENV.webhdfs_port,
                                 use_kerberos=cls.ENV.use_kerberos,
                                 verify=(not cls.ENV.use_kerberos))
    cls.hdfs.mkdir(cls.tmp_dir)
def make_ibis_client():
    ic = ibis.impala.connect(host=ENV.impala_host, port=ENV.impala_port,
                             protocol=ENV.impala_protocol,
                             use_kerberos=ENV.use_kerberos)
    if ENV.use_kerberos:
        print("Warning: ignoring invalid Certificate Authority errors")
    hc = ibis.hdfs_connect(host=ENV.nn_host, port=ENV.webhdfs_port,
                           use_kerberos=ENV.use_kerberos,
                           verify=(not ENV.use_kerberos))
    return ibis.make_client(ic, hdfs_client=hc)
def make_connection():
    ic = ibis.impala_connect(host=ENV.impala_host, port=ENV.impala_port,
                             protocol=ENV.impala_protocol,
                             use_kerberos=ENV.use_kerberos)
    if ENV.use_kerberos:
        print("Warning: ignoring invalid Certificate Authority errors")
    hdfs = ibis.hdfs_connect(host=ENV.nn_host, port=ENV.webhdfs_port,
                             use_kerberos=ENV.use_kerberos,
                             verify=(not ENV.use_kerberos))
    return ibis.make_client(ic, hdfs_client=hdfs)
def setUpClass(cls):
    cls.ENV = ENV
    cls.tmp_dir = pjoin(cls.ENV.tmp_dir, util.guid())
    if cls.ENV.auth_mechanism in ['GSSAPI', 'LDAP']:
        print("Warning: ignoring invalid Certificate Authority errors")
    cls.hdfs = ibis.hdfs_connect(
        host=cls.ENV.nn_host,
        port=cls.ENV.webhdfs_port,
        auth_mechanism=cls.ENV.auth_mechanism,
        verify=(cls.ENV.auth_mechanism not in ['GSSAPI', 'LDAP']))
    cls.hdfs.mkdir(cls.tmp_dir)
def setUpClass(cls):
    cls.ENV = ENV
    cls.tmp_dir = pjoin(cls.ENV.tmp_dir, util.guid())
    if cls.ENV.use_kerberos:
        print("Warning: ignoring invalid Certificate Authority errors")
    # NOTE: specifying superuser as set in IbisTestEnv
    cls.hdfs = ibis.hdfs_connect(host=cls.ENV.nn_host,
                                 port=cls.ENV.webhdfs_port,
                                 use_kerberos=cls.ENV.use_kerberos,
                                 verify=(not cls.ENV.use_kerberos),
                                 user=cls.ENV.hdfs_superuser)
    cls.hdfs.mkdir(cls.tmp_dir)
def setUpClass(cls):
    cls.ENV = ENV
    cls.tmp_dir = pjoin(cls.ENV.tmp_dir, util.guid())
    if cls.ENV.auth_mechanism in ['GSSAPI', 'LDAP']:
        print("Warning: ignoring invalid Certificate Authority errors")
    cls.hdfs = ibis.hdfs_connect(
        host=cls.ENV.nn_host,
        port=cls.ENV.webhdfs_port,
        auth_mechanism=cls.ENV.auth_mechanism,
        verify=(cls.ENV.auth_mechanism not in ['GSSAPI', 'LDAP']),
        user=cls.ENV.webhdfs_user)
    cls.hdfs.mkdir(cls.tmp_dir)
def _create_777_tmp_dir(cls):
    base = pjoin(cls.tmp_dir, util.guid())
    tmp_path = pjoin(base, util.guid())
    env = IbisTestEnv()
    superuser_hdfs = ibis.hdfs_connect(
        host=env.nn_host,
        port=env.webhdfs_port,
        auth_mechanism=env.auth_mechanism,
        verify=(env.auth_mechanism not in ['GSSAPI', 'LDAP']),
        user=env.hdfs_superuser)
    superuser_hdfs.mkdir(base)
    superuser_hdfs.chmod(base, '777')
    return tmp_path
def make_ibis_client(env):
    hc = ibis.hdfs_connect(host=env.nn_host, port=env.webhdfs_port,
                           auth_mechanism=env.auth_mechanism,
                           verify=env.auth_mechanism not in
                               ['GSSAPI', 'LDAP'],
                           user=env.webhdfs_user)
    if env.auth_mechanism in ('GSSAPI', 'LDAP'):
        logger.warning('Ignoring invalid Certificate Authority errors')
    return ibis.impala.connect(host=env.impala_host, port=env.impala_port,
                               auth_mechanism=env.auth_mechanism,
                               hdfs_client=hc, pool_size=16)
def setUpClass(cls):
    from ibis.backends.impala.tests.conftest import IbisTestEnv

    cls.ENV = IbisTestEnv()
    cls.tmp_dir = pjoin(cls.ENV.tmp_dir, util.guid())
    if cls.ENV.auth_mechanism in ['GSSAPI', 'LDAP']:
        print("Warning: ignoring invalid Certificate Authority errors")
    cls.hdfs = ibis.hdfs_connect(
        host=cls.ENV.nn_host,
        port=cls.ENV.webhdfs_port,
        auth_mechanism=cls.ENV.auth_mechanism,
        verify=(cls.ENV.auth_mechanism not in ['GSSAPI', 'LDAP']),
        user=cls.ENV.webhdfs_user,
    )
    cls.hdfs.mkdir(cls.tmp_dir)
def __init__(self, table_csv="newhouselog_csv", table="newhouselog",
             local_path=HIVE_NEWHOUSELOG_CSV_PATH,
             hive_path=HIVE_SERVER_NEWHOUSELOG_CSV_PATH):
    self.table_csv = table_csv
    self.table = table
    self.local_path = local_path
    self.hive_path = hive_path
    self.hdfs = ibis.hdfs_connect(host=HIVE_URL, port=HIVE_PORT)
    self.client = ibis.impala.connect(host=HIVE_URL, database='user_track',
                                      hdfs_client=self.hdfs)
    conn = connect(host=HIVE_URL)
    self.cursor = conn.cursor()
def connect(cls, data_directory):
    env = ImpalaEnv()
    hdfs_client = ibis.hdfs_connect(
        host=env.nn_host,
        port=env.webhdfs_port,
        auth_mechanism=env.auth_mechanism,
        verify=env.auth_mechanism not in ['GSSAPI', 'LDAP'],
        user=env.webhdfs_user)
    if env.auth_mechanism in ('GSSAPI', 'LDAP'):
        print("Warning: ignoring invalid Certificate Authority errors")
    return ibis.impala.connect(host=env.impala_host, port=env.impala_port,
                               auth_mechanism=env.auth_mechanism,
                               hdfs_client=hdfs_client,
                               database='ibis_testing')
def hdfs(env, tmp_dir):
    pytest.importorskip('requests')
    if env.auth_mechanism in {'GSSAPI', 'LDAP'}:
        warnings.warn("Ignoring invalid Certificate Authority errors")
    client = ibis.hdfs_connect(
        host=env.nn_host,
        port=env.webhdfs_port,
        auth_mechanism=env.auth_mechanism,
        verify=env.auth_mechanism not in {'GSSAPI', 'LDAP'},
        user=env.webhdfs_user)
    if not client.exists(tmp_dir):
        client.mkdir(tmp_dir)
    client.chmod(tmp_dir, '777')
    return client
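This is a pytest fixture, so a test receives the connected client simply by naming it as a parameter. A minimal sketch of such a test (the test itself is hypothetical, not from the source suite):

def test_tmp_dir_exists(hdfs, tmp_dir):
    # pytest injects both fixtures by parameter name; the fixture above
    # is expected to have created and opened up the directory already
    assert hdfs.exists(tmp_dir)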
def connect_test(env, with_hdfs=True):
    con = ibis.impala_connect(host=env.impala_host,
                              protocol=env.impala_protocol,
                              database=env.test_data_db,
                              port=env.impala_port,
                              use_kerberos=env.use_kerberos,
                              pool_size=2)
    if with_hdfs:
        if env.use_kerberos:
            print("Warning: ignoring invalid Certificate Authority errors")
        hdfs_client = ibis.hdfs_connect(host=env.nn_host,
                                        port=env.webhdfs_port,
                                        use_kerberos=env.use_kerberos,
                                        verify=(not env.use_kerberos))
    else:
        hdfs_client = None
    return ibis.make_client(con, hdfs_client)
def connect_test(env, with_hdfs=True):
    if with_hdfs:
        if env.auth_mechanism in ['GSSAPI', 'LDAP']:
            print("Warning: ignoring invalid Certificate Authority errors")
        hdfs_client = ibis.hdfs_connect(
            host=env.nn_host,
            port=env.webhdfs_port,
            auth_mechanism=env.auth_mechanism,
            verify=(env.auth_mechanism not in ['GSSAPI', 'LDAP']))
    else:
        hdfs_client = None
    return ibis.impala.connect(host=env.impala_host,
                               database=env.test_data_db,
                               port=env.impala_port,
                               auth_mechanism=env.auth_mechanism,
                               pool_size=2, hdfs_client=hdfs_client)
def connect(cls, module):
    hc = ibis.hdfs_connect(
        host=ENV.nn_host,
        port=ENV.webhdfs_port,
        auth_mechanism=ENV.auth_mechanism,
        verify=ENV.auth_mechanism not in ['GSSAPI', 'LDAP'],
        user=ENV.webhdfs_user,
    )
    if ENV.auth_mechanism in ('GSSAPI', 'LDAP'):
        print("Warning: ignoring invalid Certificate Authority errors")
    return module.connect(
        host=ENV.impala_host,
        port=ENV.impala_port,
        auth_mechanism=ENV.auth_mechanism,
        hdfs_client=hc,
        database='ibis_testing',
    )
def connect(data_directory: Path) -> ibis.client.Client:
    from ibis.backends.impala.tests.conftest import IbisTestEnv

    env = IbisTestEnv()
    hdfs_client = ibis.hdfs_connect(
        host=env.nn_host,
        port=env.webhdfs_port,
        auth_mechanism=env.auth_mechanism,
        verify=env.auth_mechanism not in ['GSSAPI', 'LDAP'],
        user=env.webhdfs_user,
    )
    if env.auth_mechanism in ('GSSAPI', 'LDAP'):
        print("Warning: ignoring invalid Certificate Authority errors")
    return ibis.impala.connect(
        host=env.impala_host,
        port=env.impala_port,
        auth_mechanism=env.auth_mechanism,
        hdfs_client=hdfs_client,
        database='ibis_testing',
    )
def test_create_table_with_location(self):
    base = pjoin(self.tmp_dir, util.guid())
    name = 'test_{0}'.format(util.guid())
    tmp_path = pjoin(base, name)

    # impala user has trouble writing to jenkins-owned dir so here we give
    # the tmp dir 777
    superuser_hdfs = ibis.hdfs_connect(
        host=ENV.nn_host,
        port=ENV.webhdfs_port,
        auth_mechanism=ENV.auth_mechanism,
        verify=(ENV.auth_mechanism not in ['GSSAPI', 'LDAP']),
        user=ENV.hdfs_superuser)
    superuser_hdfs.mkdir(base)
    superuser_hdfs.chmod(base, '777')

    expr = self.alltypes
    table_name = _random_table_name()

    self.con.create_table(table_name, obj=expr, location=tmp_path,
                          database=self.test_data_db)
    self.temp_tables.append('.'.join([self.test_data_db, table_name]))
    assert self.hdfs.exists(tmp_path)
# Note: must run 4.0_sparklyr.R to create the airlines database
import ibis
import os

ibis.options.interactive = True

# host where httpfs lives
# different parameters if using security
hdfs_host = os.getenv('HDFS_HOST', 'ip-10-0-0-99.us-west-2.compute.internal')
hdfs = ibis.hdfs_connect(host=hdfs_host, auth_mechanism='PLAIN',
                         verify=False, use_https=False)
hdfs.ls('/tmp')

# host where impala daemon lives
# impala_host = os.getenv('IMPALA_HOST', 'se-central-cdsw-cluster-2.vpc.cloudera.com')
impala_host = os.getenv('IMPALA_HOST', 'ip-10-0-0-154.us-west-2.compute.internal')
con = ibis.impala.connect(impala_host, hdfs_client=hdfs, database='flights')
# con = ibis.impala.connect(host=impala_host, port=21050,
#                           database='flights', hdfs_client=hdfs,
#                           auth_mechanism='PLAIN', use_ssl=False)
con.list_tables()

import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt

airlines = con.table('airlines_bi_pq')
airlines.limit(10).execute()

airports = con.table('airports')
airports.limit(10).execute()
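matplotlib and seaborn are imported but never used before the snippet ends, so presumably a plot follows. A hedged sketch of a typical follow-on; the `uniquecarrier` and `depdelay` column names are guesses, not taken from the actual `airlines_bi_pq` schema:

# hypothetical: average departure delay per carrier, plotted locally
delay = (airlines
         .group_by('uniquecarrier')
         .aggregate(avg_delay=airlines.depdelay.mean())
         .execute())
sns.barplot(x='uniquecarrier', y='avg_delay', data=delay)
plt.show()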
# test.py
# from impala.dbapi import connect
# conn = connect(host='10.0.0.228', port=21080)
# cursor = conn.cursor()
# cursor.execute('SELECT * FROM mytable LIMIT 100')
# print(cursor.description)  # prints the result set's schema
# results = cursor.fetchall()

# from pyhive import presto
# cursor = presto.connect('10.0.0.228', 21080).cursor()
# cursor.execute('SELECT * FROM my_awesome_data LIMIT 10')
# print(cursor.fetchone())
# print(cursor.fetchall())

import ibis

impala_host = '10.0.0.228'
impala_port = 21000
webhdfs_host = '10.0.0.227'
webhdfs_port = 50010

hdfs = ibis.hdfs_connect(host=webhdfs_host, port=webhdfs_port)
con = ibis.impala.connect(host=impala_host, port=impala_port,
                          hdfs_client=hdfs)
import hdfs
import ibis
import os
import pandas as pd

# Testing impala connection
impala_host = os.environ['IMPALA_HOST']
impala_port = int(os.environ['IMPALA_PORT'])
webhdfs_host = os.environ['WEBHDFS_HOST']
webhdfs_port = int(os.environ['WEBHDFS_PORT'])

# should replace with env var?
# webhdfs_host = 'ec2-54-66-248-84.ap-southeast-2.compute.amazonaws.com'
# webhdfs_port = 9870
# impala_host = 'ec2-54-66-248-84.ap-southeast-2.compute.amazonaws.com'
# impala_port = 21050

hdfs = ibis.hdfs_connect(host=webhdfs_host, port=webhdfs_port)
client = ibis.impala.connect(host=impala_host, port=impala_port,
                             hdfs_client=hdfs)
db = client.database('default')


def get_wind_data(start: int, end: int) -> pd.DataFrame:
    table = db.wind
    filtered = table.filter([table.rowid > start, table.rowid < end])
    df = filtered['speed', 'speederror', 'direction']
    return df.execute()
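A quick smoke test of the helper; the row range is arbitrary:

# fetch the first thousand readings and peek at them
df = get_wind_data(0, 1000)
print(df.head())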
# ## Interactive Mode
# Ibis also allows an interactive mode that automatically executes all
# expressions. This can be useful in a notebook or REPL. I personally prefer
# to explicitly execute expressions, but this is a personal preference.
# If you use the interactive mode, I recommend setting the default limit low
# to prevent accidentally trying to return an unreasonable number of rows to
# your local process. To safely turn on interactive mode, you would run
# something like these two commands:
#
# ibis.options.sql.default_limit = 10
# ibis.options.interactive = True

ibis.options.sql.default_limit = None

hdfs_conn = ibis.hdfs_connect(host='')
ibis_conn = ibis.impala.connect(host='', hdfs_client=hdfs_conn)
pageviews_tbl = ibis_conn.table('wiki_pageviews', database='u_juliet')

# What is in a project name? What does this data look like?
project_names_expr = pageviews_tbl.project_name.distinct()
project_names = ibis_conn.execute(project_names_expr)
project_names

# From the data docs, we know that the suffixes have the following meanings:
#
# wikibooks: ".b"
# wiktionary: ".d"
# wikimedia: ".m"
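Given those suffixes, a natural next step is to restrict the table to one project family. A sketch, assuming the suffix can be matched with a trailing `like` pattern:

# hypothetical: keep only wikibooks projects (".b" suffix) and count them
wikibooks = pageviews_tbl.filter([pageviews_tbl.project_name.like('%.b')])
ibis_conn.execute(wikibooks.count())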
# ====== Ibis config (to work around a bug) ======
with ibis.config.config_prefix('impala'):
    ibis.config.set_option('temp_db', '`__ibis_tmp`')

# ====== Connection ======
# Connect to Hive by providing the Hive host IP and port (10000 by default)
# and a WebHDFS client.
# - To add the values of these environment variables on the Saagie platform:
#   - go to the manager and choose your platform
#   - for the value of IP_HDFS: click on HDFS; in the panel that opens,
#     note the IP value in the WebHDFS section
#   - for the value of IP_HIVE: click on Hive; in the panel that opens,
#     note the IP value in the HiveServer2 section
#   - go to Settings and add these environment variables
hdfs = ibis.hdfs_connect(host=os.environ['IP_HDFS'], port=50070)
client = ibis.impala.connect(host=os.environ['IP_HIVE'], port=10000,
                             hdfs_client=hdfs, user='******',
                             password='******', auth_mechanism='PLAIN')

# ====== Writing to the table ======
# Create a simple pandas DataFrame with 2 columns
liste_hello = ['hello1', 'hello2']
liste_world = ['world1', 'world2']
df = pd.DataFrame(data={'hello': liste_hello, 'world': liste_world})

# Write the DataFrame to Hive if the table does not exist
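The snippet breaks off before the write itself. A minimal sketch of that step, assuming a table name of `hello_world` (the Impala client's `create_table` accepts a pandas DataFrame):

# hypothetical table name; create it from the DataFrame only if missing
if 'hello_world' not in client.list_tables():
    client.create_table('hello_world', df)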
import ibis
import os

ibis.options.interactive = True

# host where httpfs lives
hdfs_host = os.getenv('HDFS_HOST', 'ip-10-0-0-168.us-west-2.compute.internal')
hdfs = ibis.hdfs_connect(host=hdfs_host, port=14000,
                         auth_mechanism='GSSAPI', verify=True,
                         use_https=False)
hdfs.ls('/tmp')

# host where impala daemon lives
impala_host = os.getenv('IMPALA_HOST', 'ip-10-0-0-150.us-west-2.compute.internal')
con = ibis.impala.connect(host=impala_host, port=21050,
                          database='flights', hdfs_client=hdfs,
                          auth_mechanism='GSSAPI', use_ssl=False)
con.list_tables()

import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt

airlines = con.table('airlines_bi_pq')
airlines.limit(10).execute()

airports = con.table('airports')
# Using stats from https://wiki.cloudera.com/pages/viewpage.action?spaceKey=EDH&title=Ad-hoc+Data+Analytics+over+Clusterstats+Data
import ibis
import os
import sys
import pandas as pd
from IPython.display import display

# connect to the EDH
ibis.options.sql.default_limit = None
hdfs_conn = ibis.hdfs_connect(host='lannister-001.edh.cloudera.com')
ibis_conn = ibis.impala.connect(host='westeros.edh.cloudera.com', port=21050,
                                auth_mechanism='GSSAPI', use_ssl=True,
                                kerberos_service_name='impala',
                                hdfs_client=hdfs_conn, timeout=300)

# for interactive muddling
ibis.options.interactive = True

pd.set_option('display.width', 1000)
pd.set_option('max_colwidth', 100)
pd.set_option('display.max_rows', 500)

UUID = "3f75166e-682e-4ad1-b5fe-33171a198e58"
clustername = "PROD"
collectts = 1506704298000
roletype = "DATANODE"


def slowBlockreceiverCounts(UUID, clustername, collectts, roletype):
    sql = """
    select host, count(message) as message_count
    from customer_logs
    where customerUUID = "{}" and
)

# clickhouse
conf['clickhouse'] = dict(
    host='localhost',
    port=9000,
    user='******',
    password='',
    database='ibis_testing',
)

# impala
_hdfs_client = ibis.hdfs_connect(
    host='impala',
    port=50070,
    auth_mechanism='NOSASL',
    verify=True,
    user='******',
)
conf['impala'] = dict(
    host='localhost',
    port=21050,
    auth_mechanism='NOSASL',
    hdfs_client=_hdfs_client,
    database='ibis_testing',
)

# spark
conf['pyspark'] = dict(
    session=SparkSession.builder.getOrCreate()
)