def get_hbase_connect_pool(self):
    """
    :return: a happybase connection pool
    """
    pool = happybase.ConnectionPool(200, host=self.host, port=self.port)
    return pool

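# A minimal usage sketch for the pool returned above (assumptions: an instance
# `client` exposing get_hbase_connect_pool(), and a hypothetical table
# 'user_profile'). Connections are borrowed via the context manager and
# returned to the pool automatically when the block exits.
pool = client.get_hbase_connect_pool()
with pool.connection() as conn:
    table = conn.table('user_profile')          # hypothetical table name
    row = table.row(b'user:1', columns=[b'info:name'])
    print(row)
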
def get_connetion_pool(timeout=10):
    global conn_pool
    if conn_pool is None:
        conn_pool = happybase.ConnectionPool(10, timeout=timeout)
    return conn_pool

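# The helper above relies on a module-level singleton that must exist before
# the first call; a minimal sketch of that missing initialization and one call
# (assumption: a Thrift server is reachable on the default localhost:9090).
conn_pool = None

with get_connetion_pool(timeout=30).connection() as conn:
    print(conn.tables())
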
def update_user_ctr_feature_to_hbase(self):
    """
    Build per-user CTR features and write them into HBase.
    :return:
    """
    self.spark.sql("use profile")
    user_profile_hbase = self.spark.sql(
        "select user_id, information.birthday, information.gender, article_partial, env from user_profile_hbase"
    )

    # Feature engineering
    # Drop features with too few values
    user_profile_hbase = user_profile_hbase.drop('env', 'birthday', 'gender')

    def get_user_id(row):
        return int(row.user_id.split(":")[1]), row.article_partial

    user_profile_hbase_temp = user_profile_hbase.rdd.map(get_user_id)

    from pyspark.sql.types import *
    _schema = StructType([
        StructField("user_id", LongType()),
        StructField("weights", MapType(StringType(), DoubleType()))
    ])

    user_profile_hbase_schema = self.spark.createDataFrame(
        user_profile_hbase_temp, schema=_schema)

    def feature_preprocess(row):
        from pyspark.ml.linalg import Vectors

        channel_weights = []
        for i in range(1, 26):
            try:
                _res = sorted([
                    row.weights[key] for key in row.weights.keys()
                    if key.split(':')[0] == str(i)
                ])[:10]
                channel_weights.append(_res)
            except Exception:
                channel_weights.append([])

        return row.user_id, channel_weights

    res = user_profile_hbase_schema.rdd.map(feature_preprocess).collect()

    # Bulk insert into HBase
    pool = happybase.ConnectionPool(size=10, host='hadoop-master', port=9090)
    with pool.connection() as conn:
        ctr_feature = conn.table('ctr_feature_user')
        with ctr_feature.batch(transaction=True) as b:
            for i in range(len(res)):
                for j in range(25):
                    b.put(
                        "{}".format(res[i][0]).encode(),
                        {
                            "channel:{}".format(j + 1).encode():
                                str(res[i][1][j]).encode()
                        })
    # The with-block returns the connection to the pool; no explicit close is needed.

def save_content_filter_history_to__recall(partition):
    """For each article a user acted on, compute its similar articles, filter
    them, and write the result into the content recall table (supports
    multiple timestamp versions).
    """
    import happybase
    pool = happybase.ConnectionPool(size=10, host='hadoop-master')

    # Look up similar articles
    with pool.connection() as conn:

        # key: article_id, column: similar:article_id
        similar_table = conn.table('article_similar')

        # Iterate over the partition
        for row in partition:
            # Read the similar-article results for this article
            similar_article = similar_table.row(
                str(row.article_id).encode(), columns=[b'similar'])
            # Sort by similarity; the recall set should stay small (hundreds at most)
            _srt = sorted(similar_article.items(),
                          key=lambda item: item[1],
                          reverse=True)
            if _srt:
                # Recommend 10 articles per action
                reco_article = [int(i[0].split(b':')[1]) for i in _srt][:10]

                # Articles of this channel the user has already seen
                history_table = conn.table('history_recall')
                # Multiple versions
                data = history_table.cells(
                    'reco:his:{}'.format(row.user_id).encode(),
                    'channel:{}'.format(row.channel_id).encode())

                history = []
                if len(data) >= 2:
                    for l in data[:-1]:
                        history.extend(eval(l))

                # Filter reco_article against history
                reco_res = list(set(reco_article) - set(history))

                # Write the recommendation into the content-based recall table
                # and into the table of previously recommended articles
                if reco_res:
                    # content_table = conn.table('cb_content_recall')
                    content_table = conn.table('cb_recall')
                    content_table.put(
                        "recall:user:{}".format(row.user_id).encode(),
                        {
                            'content:{}'.format(row.channel_id).encode():
                                str(reco_res).encode()
                        })

                    # Record the articles as already recommended
                    history_table.put(
                        "reco:his:{}".format(row.user_id).encode(),
                        {
                            'channel:{}'.format(row.channel_id).encode():
                                str(reco_res).encode()
                        })
    # The with-block returns the connection to the pool; no explicit close is needed.

def get_similar_online_recall(rdd):
    """
    Parse the contents of the rdd, then fetch and compute the recall result.
    :param rdd:
    :return:
    """
    # rdd ---> the data itself
    # [row(1,2,3), row(4,5,6)] -----> [[1,2,3], [4,5,6]]
    import happybase
    # Initialize the happybase connection pool
    pool = happybase.ConnectionPool(size=10, host='hadoop-master', port=9090)
    for data in rdd.collect():

        # Filter on the action recorded in the data dict
        if data['param']['action'] in ["click", "collect", "share"]:
            logger.info(
                "{} INFO: get user_id:{} action:{} log".format(
                    datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    data['param']['userId'], data['param']['action']))

            # Read articleId from param and look up similar articles
            with pool.connection() as conn:
                sim_table = conn.table("article_similar")

                # From the article in the user's click log, find the most similar
                # articles (content-based) and take the top-k as the recall result
                _dic = sim_table.row(str(data["param"]["articleId"]).encode(),
                                     columns=[b"similar"])
                _srt = sorted(_dic.items(),
                              key=lambda obj: obj[1],
                              reverse=True)  # sort by similarity
                if _srt:
                    topKSimIds = [int(i[0].split(b":")[1]) for i in _srt[:10]]

                    # Filter against the history table (articles already recommended to this user)
                    history_table = conn.table("history_recall")

                    _history_data = history_table.cells(
                        b"reco:his:%s" % data["param"]["userId"].encode(),
                        b"channel:%d" % data["channelId"]
                    )
                    # print("_history_data: ", _history_data)
                    # history = []
                    # if len(data) >= 2:
                    #     for l in data[:-1]:
                    #         history.extend(eval(l))
                    # else:
                    #     history = []

                    history = []
                    if len(_history_data) > 1:
                        for l in _history_data:
                            history.extend(eval(l))  # each cell stores a stringified list

                    # Filter the recall result against the history records
                    recall_list = list(set(topKSimIds) - set(history))

                    # If there is a recall result, add it to the cb_recall table
                    # and record it in the history table
                    logger.info(
                        "{} INFO: store online recall data:{}".format(
                            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                            str(recall_list)))
                    if recall_list:
                        recall_table = conn.table("cb_recall")
                        recall_table.put(
                            b"recall:user:%s" % data["param"]["userId"].encode(),
                            {b"online:%d" % data["channelId"]: str(recall_list).encode()}
                        )
                        history_table.put(
                            b"reco:his:%s" % data["param"]["userId"].encode(),
                            {b"channel:%d" % data["channelId"]: str(recall_list).encode()}
                        )
            # The with-block returns the connection to the pool; no explicit close is needed.

import happybase
import json
import sha1_tools

hbase_conn_timeout = None
pool = happybase.ConnectionPool(size=12, host='10.1.94.57', timeout=hbase_conn_timeout)
sha1_tools.pool = pool

global_var = json.load(open('../../conf/global_var_all.json'))
sha1_tools.global_var = global_var

sha1_mysql = sha1_tools.get_SHA1_from_MySQL(1)
print sha1_mysql
sha1_mysql = sha1_tools.get_SHA1_from_MySQL(151)
print sha1_mysql
sha1_mysql = sha1_tools.get_SHA1_from_MySQL(10)
print sha1_mysql

sha1_aaron = sha1_tools.compute_SHA1_for_image_id_from_tab_aaron(
    1, "aaron_memex_ht-images")
print sha1_aaron
sha1_aaron = sha1_tools.compute_SHA1_for_image_id_from_tab_aaron(
    10, "aaron_memex_ht-images")
print sha1_aaron

sha1s_mysql = sha1_tools.get_batch_SHA1_from_mysql(["1", "10", "151"])
print sha1s_mysql
sha1s_mysql = sha1_tools.get_batch_SHA1_from_mysql([1, 10, 151])
print sha1s_mysql

# sqlite settings
vuri = ':memory:'
dbc = apsw.Connection(vuri)

# xbee connection
ser = serial.Serial('/dev/ttyAMA0', 9600, timeout=5)
xbee = ZigBee(ser, escaped=True)

# basic sensor keys
knownprekeys = ['40b5af00_rx000A01_', '40b5af00_rx000A02_', '40b5af00_rx000A03_',
                '40b5af00_rx000A04_', '40b5af00_rx000A05_', '40b5af00_rx000A06_',
                '40b5af00_rx000A07_', '40b5af01_rx000A01_', '40b5af01_rx000A02_',
                '40b5af01_rx000A03_', '40b5af01_rx000A04_', '40b5af01_rx000A05_',
                '40b5af01_rx000A07_', '40b5af01_rx000B01_', '40b5af01_rx000B02_',
                '40b5af01_rx000B03_', '40b5af01_rx000B04_']

time.sleep(2)

# Happybase connection pool to the HBase server. Uses ssh port forwarding to connect to the remote host.
hpool = happybase.ConnectionPool(6, host='localhost')


# classes used for multithreading
class myThreadInsert(threading.Thread):
    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        try:
            xinsert()
        except Exception:
            logging.exception("xinsert")


class myThreadRead(threading.Thread):
    def __init__(self):

import datetime
import time

import numpy as np
import happybase
from elasticsearch_dsl import connections, Search
import faiss

pool = happybase.ConnectionPool(size=10, host='localhost', port=9091)
connections.create_connection(hosts=['localhost'], timeout=20)

faiss_model_path = "faiss.model"
index = faiss.read_index(faiss_model_path)
model_update_time = ""


def get_user_profile_recall(user_id, num_items):
    """
    User-preference recall: read the user's top-1/2/3 sub-category preferences
    from HBase, search ES, and re-rank the hits by release date.
    :param user_id:
    :param num_items:
    :return: item_list
    """
    with pool.connection() as conn:
        table = conn.table('TOPIC_LIKE')
        row = table.row(user_id, columns=[b'INFO:PTY1', b'INFO:PTY2', b'INFO:PTY3'])
    # The with-block returns the connection to the pool; no explicit close is needed.
    search_size = {b"INFO:PTY1": 0.5, b"INFO:PTY2": 0.3, b"INFO:PTY3": 0.2}
    item_list = []

# coding=utf-8
import sys
sys.path.append("../configs")
sys.path.append("configs")
import settings
import happybase
import json
import logging

pool = happybase.ConnectionPool(size=settings.hbase_pool_size,
                                host=settings.hbase_host,
                                table_prefix=settings.hbase_table_prefix,
                                protocol='compact')
# conn = happybase.Connection(host=settings.hbase_host,
#                             table_prefix=settings.hbase_table_prefix,
#                             protocol="compact")


def create_table(table_name):
    try:
        with pool.connection() as conn:
            conn.create_table(table_name, {
                'index': dict(max_versions=1),
                'data': dict(max_versions=1)
            })
    except Exception, e:
        logging.exception(e)
        return False
    return True

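# Example call for create_table() above (hypothetical table name 'articles');
# the table is created under settings.hbase_table_prefix with the 'index' and
# 'data' column families defined in the helper.
if create_table('articles'):
    logging.info("table created")
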
    scorelist = []
    count = 0
    for y in x[1][1]:
        scorelist.append(y)
        count += 1
    scorelist.sort(reverse=True)
    # make sure we don't overwhelm users with too many photos under the same tag
    if count > NUM_PHOTO_PER_TAG:
        scorelist = scorelist[:NUM_PHOTO_PER_TAG]
    return x[0], (x[1][0], scorelist)


# sample input
# (2, (u"ptag2", [(0.7692307692307693, '{"photo": {"timeposted": 1422939564, "description": "pdes", "title": "ptitle", "URL": "purl", "tags": "ptag1,ptag2,ptag3", "pid": "101", "location": {"latitude": "plat", "longitude": "plon"}}, "numViewed": 3, "numLiked": 10}'), (0.4230769230769231, '{"photo": {"timeposted": 1422939564, "description": "pdes", "title": "ptitle", "URL": "purl", "tags": "ptag1,ptag2,ptag3", "pid": "103", "location": {"latitude": "plat", "longitude": "plon"}}, "numViewed": 15, "numLiked": 11}')]))

POOL = happybase.ConnectionPool(size=30, host="c0tl.com")


def writeToHBase(x):
    # print "count ", x[0]
    # print "tag name", str(x[1][0])
    # print "first photo", x[1][1][0]
    print "writing to hbase.., count,", x[0]
    plist = x[1][1]
    pdict = {}
    for i in range(len(plist)):
        pdict[i] = json.loads(plist[i][1])
    with POOL.connection() as connection:
        tagview = connection.table('top_tags')
        rowkey = "%016i" % int(x[0]) + hashlib.md5(str(x[1][0])).digest()
        tagview.put(rowkey, {

#! /usr/bin/python
import happybase

pool = happybase.ConnectionPool(1, host='localhost', port=9090)

from collections import defaultdict, namedtuple

TaskContent = namedtuple('TaskContent', [
    'submit_at', 'from_reverse', 'site_asset_id', 'deadline', 'id', 'retries',
    'account', 'uuid', 'created_at', 'format', 'priority', 'scope',
    'queued_at', 'dna_url'
])

matches = [{
    'video_score': 99,
    'meta_uuid': '970ae0ba-773b-11e1-a7b2-080027cf46d6',
    'video_sample_offset': 0,
    'match_type': 'video',
    'meta_name': 'Auto_Rule306_Movie',
    'video_ref_offset': 0,
    'audio_sample_offset': 0,
    'audio_score': 0,
    'audio_duration': 0,
    'track_id': 0.0,
    'instance_id': '9752d1cc-773b-11e1-a7b2-080027cf46d6',
    'audio_ref_offset': 0,
    'clip_duration': 307,
    'media_type': 'video',
    'video_duration': 307,
    'instance_name': 'cappella.flv.xfp.0'

import json
import datetime
import sys
import os
import logging

import pandas as pd
import happybase

from api2.mysql import mysql

# connection = happybase.Connection(host='120.27.241.54', transport='framed', protocol='compact')
# connection.open()
# table = connection.table('usersize_recommend')
pool = happybase.ConnectionPool(size=10, host='120.27.241.54',
                                transport='framed', protocol='compact')


class recommendProduct(object):
    """docstring for ClassName"""

    def __init__(self, ):
        pass


def computedUserRecommendProd(userid):
    print(userid)
    # userid = userid.decode('utf8')
    # userid = json.loads(userid)
    # userid = userid['userid']
    logging.info(userid)

# -*- coding: utf-8 -*-
import happybase

pool = happybase.ConnectionPool(host='localhost', port=9090, size=10)

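# Sketch of reading through the pool created above (hypothetical table
# 'events' and column 'cf:payload'); Table.scan() yields (row_key, data)
# pairs where data maps column names to values.
with pool.connection() as conn:
    events = conn.table('events')
    for row_key, data in events.scan(columns=[b'cf:payload'], limit=5):
        print(row_key, data)
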
def __init__(self, ):
    self.conn = hconn.ConnectionPool(size=8, host='133.0.6.89')
    self.table = b'vip:tian_yan'
    self.html_col = b'data:html'
    self.summary_col = b'data:summary'

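# Hypothetical companion method for the fields initialized above (not part of
# the original snippet): fetch the stored HTML and summary for one row key
# through the pooled connection.
def get_doc(self, row_key):
    with self.conn.connection() as conn:
        return conn.table(self.table).row(
            row_key, columns=[self.html_col, self.summary_col])
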
def create_data_from_station_data(first, second):
    """This function creates the data by analyzing the two stations in comparison."""
    global hdfs   # global hdfs object
    global hbase  # global hbase object

    if hdfs is None:
        from pywebhdfs.webhdfs import PyWebHdfsClient
        hdfs = PyWebHdfsClient(host='cshadoop.boisestate.edu', port='50070',
                               user_name='uacharya')

    if hbase is None:
        import happybase
        hbase = happybase.ConnectionPool(size=1, host='cshadoop.boisestate.edu')

    date_for_comparision = first["Date"].strip()

    # creating a directory for each date
    try:
        hdfs.get_file_dir_status('user/uacharya/single_screen/' + date_for_comparision)
    except Exception:
        # directory to hold the dataset in a csv file for each node in the wall display, numbered 1 to 9
        content = 'Date,ID,Source,Destination,S_Lat,S_Lon,D_Lat,D_Lon,Wind_Lat,Wind_Lon,Wind_Velocity\n'
        try:
            hdfs.create_file('user/uacharya/single_screen/' + date_for_comparision + '/data/output.csv',
                             content, replication=1)
        except Exception:
            pass

    dataset = {'node_1': [], 'node_2': [], 'node_3': []}

    for data in broadcast_variable.value:
        compare_data_between(date_for_comparision, first, data, dataset)

    # for key in dataset:
    #     if(len(dataset[key])!=0):
    #         content = "\n".join(dataset[key]);
    #         content +="\n";
    #         while(True):
    #             try:
    #                 hdfs.append_file('user/uacharya/simulation/'+date+'/'+key+'/output.csv',content,buffersize=4096);
    #                 break;
    #             except Exception:
    #                 time.sleep(0.2);
    #                 continue;

    dataset.clear()  # clearing the dictionary
    # append over here after all the global variables have been set
    return second

def save_hbase(entries):
    pool = happybase.ConnectionPool(size=3, host=HBASE_HOST)
    for entry in entries:
        with pool.connection() as connection:
            table = connection.table(HBASE_TABLE)
            table.put(entry[0], entry[1])

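# A hedged alternative to save_hbase() above: borrow one connection for the
# whole loop and buffer the writes with Table.batch(), which flushes every
# batch_size puts and sends the remainder when the with-block exits.
# HBASE_HOST / HBASE_TABLE are the same externally defined settings.
def save_hbase_batched(entries):
    pool = happybase.ConnectionPool(size=3, host=HBASE_HOST)
    with pool.connection() as connection:
        table = connection.table(HBASE_TABLE)
        with table.batch(batch_size=1000) as b:
            for row_key, data in entries:
                b.put(row_key, data)
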
# escorts_images_sha1_infos_from_ts_subsampled_newformat => ht_images_infos_merged_subsampled
# discard ad:*, ext:sbcmdline (at least for real transform). just do not put them in mappings
# mappings should be an array of arrays like:
# ["ext:dlib*", "data:dlib*"]
# ["ext:sbpycaffe*", "data:sbpycaffe*"]
# ["info:s3_url", "data:s3_url"]
# Could be a parameter in conf
HAPPYBASE_HOST = '10.108.16.137'
# TODO: should we also transform update table?

# Try to create "tab_name_out"
HBASE_TIMEOUT = None
NB_THREADS = 1
POOL = happybase.ConnectionPool(size=NB_THREADS, host=HAPPYBASE_HOST, timeout=HBASE_TIMEOUT)
with POOL.connection() as CONN:
    get_create_table(TAB_NAME_OUT, CONN, TAB_OUT_FAMILIES)

# Setup spark job
SC = SparkContext(appName='transform_' + TAB_NAME_IN + '_to_' + TAB_NAME_OUT)
SC.setLogLevel("ERROR")
CONF = SparkConf()
HBASE_MAN_IN = HbaseManager(SC, CONF, HBASE_HOST_SPARK, TAB_NAME_IN)
HBASE_MAN_OUT = HbaseManager(SC, CONF, HBASE_HOST_SPARK, TAB_NAME_OUT)
transform_table()
print("Transformation completed.")

import logging

from django.conf import settings
from django.http import HttpResponse
import happybase

logger = logging.getLogger(__name__)

N_KEYS = 10000

#
# Initialization
#
# Importing this module has side-effects; way to go Django. :s
#

pool = happybase.ConnectionPool(size=3, host=settings.HBASE_HOST)


def populate_table():
    with pool.connection() as connection:
        connection.delete_table(settings.HBASE_TABLE, disable=True)
        connection.create_table(settings.HBASE_TABLE, families={'cf': {}})
        table = connection.table(settings.HBASE_TABLE)
        with table.batch() as b:
            for i in xrange(N_KEYS):
                row_data = {'cf:col1': 'value-%d' % i}
                b.put('row-key-%d' % i, row_data)


with pool.connection() as connection:
    if not settings.HBASE_TABLE in connection.tables():

import functools

import common
import crawler
import happybase

from . import misc

conf = common.args.hbase_conf
host = conf["hbase_thrift_host"]
port = conf["hbase_thrift_port"]
table_prefix = conf["table_prefix"]
table_prefix_separator = conf["table_prefix_separator"]

hbase_pool = happybase.ConnectionPool(
    size=3,
    host=host,
    port=port,
    table_prefix=table_prefix,
    table_prefix_separator=table_prefix_separator
)

# Public interface of this module, implemented with partial functions
# get_job_rule = functools.partial(misc._get_job_rule, hbase_pool)
# set_job_rule = functools.partial(misc._set_job_rule, hbase_pool)
# save_job = functools.partial(misc._save_job, hbase_pool)
# remove_job = functools.partial(misc._remove_job, hbase_pool)
# save_results = functools.partial(misc._save_results, hbase_pool)


def get_job_rule(job_name) -> crawler.CrawlJobCore:
    '''
    Fetch the crawl_job_core (crawl rules) stored in HBase

def get_similar_online_recall(rdd):
    import happybase
    pool = happybase.ConnectionPool(size=10, host='hadoop-master', port=9090)
    # Parse the contents of the rdd, then fetch and compute the recall result
    # rdd of [row(1,2,3), row(4,5,6)] -----> rdd.collect() gives [[1,2,3], [4,5,6]]
    for data in rdd.collect():

        # Filter on the action recorded in the data dict
        if data['param']['action'] in ["click", "collect", "share"]:
            logger.info(
                "{} INFO: get user_id:{} action:{} log".format(
                    datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    data['param']['userId'], data['param']['action']))

            # Read articleId from param and look up similar articles
            with pool.connection() as conn:
                sim_table = conn.table("article_similar")

                # From the article in the user's click log, find the most similar
                # articles (content-based) and take the top-k as the recall result
                _dic = sim_table.row(str(data["param"]["articleId"]).encode(),
                                     columns=[b"similar"])
                if _dic:
                    logger.info("_dic is " + str(_dic))
                    # {b'similar:1': b'0.2', b'similar:2': b'0.34', b'similar:3': b'0.267', b'similar:4': b'0.56', b'similar:5': b'0.7', b'similar:6': b'0.819', b'similar:8': b'0.28'}
                    _srt = sorted(_dic.items(),
                                  key=lambda obj: obj[1],
                                  reverse=True)  # sort by similarity
                    logger.info("_srt is " + str(_srt))
                    # [(b'similar:6', b'0.819'), (b'similar:5', b'0.7'), (b'similar:4', b'0.56'), (b'similar:2', b'0.34'), (b'similar:8', b'0.28'), (b'similar:3', b'0.267'), (b'similar:1', b'0.2')]
                    topKSimIds = [
                        int(i[0].split(b":")[1]) for i in _srt[:10]
                    ]
                    logger.info("topKSimIds is " + str(topKSimIds))
                    # [6, 5, 4, 2, 8, 3, 1]

                    # Filter against the history_recall table (articles already recommended to this user)
                    history_table = conn.table("history_recall")

                    _history_data = history_table.cells(
                        b"reco:his:%s" % data["param"]["userId"].encode(),
                        b"channel:%d" % data["channelId"])
                    logger.info("_history_data is " + str(_history_data))

                    history = []
                    if len(_history_data) >= 1:
                        for l in _history_data:
                            history.extend(eval(l))
                    logger.info("history is " + str(history))

                    # Filter the recall result against the history records
                    recall_list = list(set(topKSimIds) - set(history))
                    logger.info("recall_list is " + str(recall_list))

                    # If there is a recall result, add it to the cb_recall table
                    # and record it in the history table
                    logger.info(
                        "{} INFO: store online recall data:{}".format(
                            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                            str(recall_list)))
                    if recall_list:
                        recall_table = conn.table("cb_recall")
                        recall_table.put(
                            b"recall:user:%s" % data["param"]["userId"].encode(),
                            {
                                b"online:%d" % data["channelId"]:
                                    str(recall_list).encode()
                            })
                        history_table.put(
                            b"reco:his:%s" % data["param"]["userId"].encode(),
                            {
                                b"channel:%d" % data["channelId"]:
                                    str(recall_list).encode()
                            })
            # The with-block returns the connection to the pool; no explicit close is needed.
            logger.info("-" * 30)

from kafka import KafkaConsumer
import time
import happybase
import json

hbase_ip = '127.0.0.1'
hbase_port = 9090
ip = hbase_ip
port = hbase_port
pool = happybase.ConnectionPool(size=3, host=ip)


# Insert data into tableName
def hbase_load(tableName, lists):
    with pool.connection() as connection:
        # the pooled connection is already open; no explicit connection.open() is needed
        if tableName not in str(connection.tables()):
            create_table(connection, tableName)
        # print(tableName, str(connection.tables()))
        table = connection.table(tableName)
        b = table.batch(batch_size=1024)
        for li in lists:
            try:
                rowkey = li['info']
                data_dicts = {}
                for d, x in li.items():
                    key = "ss:" + d
                    value = str(x)
                    data_dicts[key] = value
                b.put(row=rowkey, data=data_dicts)
                b.send()

import happybase

# gives error
# TSocket read 0 bytes
# [Errno 32] Broken pipe

if __name__ == "__main__":
    pool = happybase.ConnectionPool(size=1, host="10.1.94.57")
    with pool.connection() as conn:
        table_name = "escorts_images_sha1_infos_dev"
        hbase_table = conn.table(table_name)
        batch_list_queries = ["000421227D83DA48DB4A417FCEFCA68272398B8E"]
        rows = hbase_table.rows(batch_list_queries)
        print rows

def insert_row(batch, row):
    batch.put(str(row), {"data:value": str(row + 10)})
    print "Insert row %i" % (row)


def delete_row(batch, row):
    batch.delete(str(row))
    print "Delete row %i" % (row)


# Start to run
# connection, table, batch = connect_to_hbase()
pool = happybase.ConnectionPool(size=3, host=host, table_prefix=namespace,
                                table_prefix_separator=':', port=9090)
with pool.connection() as connection:
    # print "Connect to HBase. batch size: %i" % (batch_size)
    print(connection.tables())
    table = connection.table(name=table_name)
    batch = table.batch(batch_size=batch_size)
    for row in range(1, 10000):
        insert_row(batch, row)
    batch.send()
    # with batch:
    #     insert_row(batch, row)

print u"%s 结束HBASE插入" % time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) print u"插入耗时: %s s" % ((end_put_time - start_put_time).seconds) #b.send() #进行GMT的处理 second_storing = Second_Storing() second_storing.merge_GMT_time(file[0:21]) happybase_end_time = datetime.datetime.now() #print u"存入耗时: %s"%((happybase_end_time - happybase_start_time).seconds) pool = happybase.ConnectionPool( size=66, host='10.210.180.43', port=9090, timeout=None, autoconnect=True, compat='0.94', ) from multiprocessing import Pool import os, time, random def put_data(table_name, list_put_table_data, counter_list_all_para): cut_number = (counter_list_all_para // 74) + 1 #cut_number = 1 print "进程数: %s" % cut_number list_cut = div_list(list_put_table_data, cut_number) print list_cut[0][0][0], list_cut[0][-1][0] #print list_cut[1][0][0], list_cut[1][-1][0]
# -*- coding: UTF-8 -*-
import happybase
from setting.default import DefaultConfig
import redis

pool = happybase.ConnectionPool(size=10, host='hadoop-master', port=9090)

# Recall data
# With decode_responses=True the values read back are str; without it they are bytes.
redis_client = redis.StrictRedis(host=DefaultConfig.REDIS_HOST,
                                 port=DefaultConfig.REDIS_PORT,
                                 db=10,
                                 decode_responses=True)

# Redis database used as a cache
# With decode_responses=True the values read back are str; without it they are bytes.
cache_client = redis.StrictRedis(host=DefaultConfig.REDIS_HOST,
                                 port=DefaultConfig.REDIS_PORT,
                                 db=8,
                                 decode_responses=True)

# Used by the ranking logic in sort_service.py
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Spark configuration
conf = SparkConf()
conf.setAll(DefaultConfig.SPARK_GRPC_CONFIG)
SORT_SPARK = SparkSession.builder.config(conf=conf).getOrCreate()

def get_connection_pool(self, size=128, **kw):
    self.pool = happybase.ConnectionPool(size=size, **kw)
    return self.pool

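# Example call for the helper above (hypothetical instance `obj`, host and
# prefix); the extra keyword arguments are forwarded to happybase.Connection.
pool = obj.get_connection_pool(size=32, host='hbase-thrift', port=9090,
                               table_prefix='app')
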
def get_hb_conn():
    hbase_pool = happybase.ConnectionPool(size=2, host=Read().hbase_host)
    return hbase_pool

print("this is the driver container") # getting the header of the whole dataset header = distributed_dataset.first() # filtering the header out of the data distributed_dataset = distributed_dataset.filter(lambda d: d != header) # mapping the data to prepare for processing data_in_required_format = distributed_dataset.map( create_required_datewise_data) data_in_required_format.cache() #collecting keys to do batch processing based on keys temp = set(data_in_required_format.keys().collect()) print("total keys " + str(len(temp))) #sorting keys to create data in chronological order based on date sorted_keys = sorted(temp, key=int) #connecting to database for writing checker data database = happybase.ConnectionPool(size=1, host='cshadoop.boisestate.edu') #getting a connection from the pool # with database.connection() as db: # db.create_table('fChecker'.encode(),{'f'.encode():dict(max_versions=1,in_memory=True)}); #creating batch processing with new rdd each iteration based on key values for key in sorted_keys[:2]: print(key) keyed_rdd = data_in_required_format.filter(lambda t: t[0] == key).map( lambda t: t[1]).coalesce(48, shuffle=True) keyed_rdd.cache() #collecting all the dataset for broadcasting broadcast_data = keyed_rdd.collect() print(str(len(broadcast_data)) + " driver program") # l = keyed_rdd.glom().map(len).collect() # get length of each partition # print(min(l), max(l), sum(l)/len(l), len(l)) # check if skewed # broadcasting the entire keyed dataset
import thread

import happybase


def user_table():
    with pool.connection() as connection:
        user = connection.table('user')
        scaner = user.scan()
        for key, data in scaner:
            print key, data


def movie_table():
    with pool.connection() as connection:
        connection.enable_table('movie')
        movie = connection.table('movie')
        scaner = movie.scan()
        for key, data in scaner:
            print key, data


pool = happybase.ConnectionPool(size=3, host='hadoop_env.com', table_prefix='pool_test')

try:
    thread.start_new_thread(user_table, ())
    thread.start_new_thread(movie_table, ())
except:
    print "Error: unable to start threads"

def __init__(self, host, table_prefix, table_name):
    self.pool = hb.ConnectionPool(size=16, host=host, autoconnect=True,
                                  table_prefix=table_prefix)
    self.table_name = table_name

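# Hypothetical companion method (not part of the original snippet) showing how
# the pool built in __init__ might be used: borrow a connection and read one
# row from self.table_name.
def read_row(self, row_key):
    with self.pool.connection() as conn:
        return conn.table(self.table_name).row(row_key)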