def puts(self, rowKeys, values, qualifier='1'):
    """Write several rows to ``self.table`` with one ``mutateRows`` call.

    Each element of ``values`` is one row. The j-th item of a row is stored
    in column ``self.columnFamilies[j] + ':' + qualifier``. The qualifier is
    auto-incremented after every row.

    :param rowKeys: a list holding one row key per row, or a single row key
        that is reused for every row
    :param values: 2-dimensional list, one inner list per row, e.g.
        ``[name, sex, age]``; every item must be ``str`` or ``int``
    :param qualifier: starting column qualifier; must be a string holding an
        integer because it is incremented with ``int(qualifier) + 1``
    :raises TypeError: if an item is neither ``str`` nor ``int``

    Usage::

        >>> HBaseTest().puts('test', [['lee', 'f', '27'], ['clark', 'm', 27], ['dan', 'f', '27']])
    """
    mutationsBatch = []
    if not isinstance(rowKeys, list):
        rowKeys = [rowKeys] * len(values)

    for i, value in enumerate(values):
        mutations = []
        for j, column in enumerate(value):
            if isinstance(column, str):
                cell = column
            elif isinstance(column, int):
                cell = encode(column)
            else:
                # Previously an unsupported type either hit an unbound local
                # or silently re-appended the previous column's mutation.
                raise TypeError(
                    "unsupported value type for HBase cell: %s"
                    % type(column).__name__)
            mutations.append(Hbase.Mutation(
                column=self.columnFamilies[j] + ':' + qualifier,
                value=cell))

        # Advance the qualifier once per row so that rows sharing a row key
        # land in distinct columns.
        qualifier = str(int(qualifier) + 1)
        mutationsBatch.append(
            Hbase.BatchMutation(row=rowKeys[i], mutations=mutations))

    self.client.mutateRows(self.table, mutationsBatch, {})
def puts(self, rowKeys, qualifier, values):
    """Write one value per row into a single column with one ``mutateRows`` call.

    Every value is stored in column
    ``self.columnFamilies[0] + ':' + qualifier`` (UTF-8 encoded).

    :param rowKeys: a list holding one row key per value, or a single row
        key that is reused for every value; keys that are not already
        ``bytes`` are converted with ``str()`` and UTF-8 encoded
    :param qualifier: column qualifier (``str``)
    :param values: flat list of cell values; ``str`` values are UTF-8
        encoded, ``int`` values are passed through ``encode()``
    :raises TypeError: if a value is neither ``str`` nor ``int``

    Usage::

        >>> HBaseTest('table').puts(rowKeys=[1,2,3],qualifier="name",values=[1,2,3])
    """
    mutationsBatch = []
    if not isinstance(rowKeys, list):
        rowKeys = [rowKeys] * len(values)

    for i, value in enumerate(values):
        if isinstance(value, str):
            cell = value.encode('utf-8')
        elif isinstance(value, int):
            cell = encode(value)
        else:
            # Previously an unsupported type either hit an unbound local or
            # silently re-used the mutation built for the previous value.
            raise TypeError(
                "unsupported value type for HBase cell: %s"
                % type(value).__name__)

        column = (self.columnFamilies[0] + ':' + qualifier).encode('utf-8')

        # Accept non-string keys (the usage example passes integers), which
        # would otherwise fail on ``.encode``.
        row = rowKeys[i]
        if not isinstance(row, bytes):
            row = str(row).encode('utf-8')

        mutationsBatch.append(Hbase.BatchMutation(
            row=row,
            mutations=[Hbase.Mutation(column=column, value=cell)]))

    self.client.mutateRows(self.table, mutationsBatch, {})
def write_hbase(data, table_name, ip, server_port):
    """Write image-type records to HBase through a Thrift connection.

    All records are collected into one batch and sent with a single
    ``mutateRows`` call. For each record the row key is the hex MD5 digest
    of ``item['url']`` and the value of ``item['img_type']`` is stored in
    column ``info:img_type``.

    :param data: iterable of records; each record is a dict that provides
        the keys ``'url'`` and ``'img_type'``. ``item['img_type']`` is passed
        to ``bytes(..., encoding='utf-8')`` and therefore must be a ``str``.
    :param table_name: name of the target table (``str`` or ``bytes``)
    :param ip: host of the target Thrift server
    :param server_port: port of the target Thrift server
    """
    if not isinstance(table_name, bytes):
        table_name = bytes(table_name, encoding='utf-8')

    # Set up the Thrift connection.
    transport = TTransport.TBufferedTransport(TSocket.TSocket(ip, server_port))
    protocol = TBinaryProtocol.TBinaryProtocol(transport)
    client = Hbase.Client(protocol)
    transport.open()

    try:
        result = []
        for item in data:
            img_type = bytes(item['img_type'], encoding='utf-8')
            row_key = bytes(
                hashlib.md5(item['url'].encode()).hexdigest(),
                encoding='utf-8')
            mutations = [Mutation(column=b'info:img_type', value=img_type)]
            result.append(Hbase.BatchMutation(row=row_key, mutations=mutations))
        client.mutateRows(table_name, result, None)
    finally:
        # Always release the connection, even if a record is malformed or
        # the write fails.
        transport.close()
def puts(self, rowkey, columnFamilies, values):
    """Write several rows to ``self.dbname`` with one ``mutateRows`` call.

    The j-th item of each row is stored in column
    ``columnFamilies[j] + ':0'``.

    :param rowkey: a list holding one row key per row, or a single row key
        that is reused for every row
    :param columnFamilies: column family names, indexed by item position
    :param values: 2-dimensional list, one inner list per row; every item
        must be ``str`` or ``int`` (``int`` items go through ``encode()``)
    :return: ``True`` on success, ``False`` if one of the HBase/Thrift
        errors listed in the ``except`` clause was raised (it is logged
        and printed)
    :raises TypeError: if an item is neither ``str`` nor ``int``
    """
    # A list is used as-is. Previously ``rowKeys`` was only bound for the
    # single-key case, so passing a list failed with an unbound local.
    if isinstance(rowkey, list):
        rowKeys = rowkey
    else:
        rowKeys = [rowkey] * len(values)

    mutationsBatch = []
    try:
        for i, value in enumerate(values):
            mutations = []
            for j, column in enumerate(value):
                if isinstance(column, str):
                    cell = column
                elif isinstance(column, int):
                    cell = encode(column)
                else:
                    # Avoid silently re-appending the previous mutation.
                    raise TypeError(
                        "unsupported value type for HBase cell: %s"
                        % type(column).__name__)
                mutations.append(Hbase.Mutation(
                    column=columnFamilies[j] + ':' + '0', value=cell))
            mutationsBatch.append(
                Hbase.BatchMutation(row=rowKeys[i], mutations=mutations))
        self.client.mutateRows(self.dbname, mutationsBatch)
        return True
    except (Hbase.IOError, Hbase.TException, Hbase.TApplicationException,
            Hbase.IllegalArgument) as e:
        logInfo('puts')
        logInfo(e)
        print(e)
        return False
def write_data_to_hbase(data, col_names, table_name, ip, server_port):
    """Push the rows of an iterator to HBase in batches of 1000.

    According to the original description this is meant to be called from
    ``mapPartitions``, with the partition's rows passed in as an iterator.
    Rows are buffered; whenever the buffer reaches 1000 rows it is sent
    with ``mutateRows`` and cleared. Any remainder is sent at the end.

    :param data: iterable of rows; each row is a mapping that provides
        ``"rowKey"`` and, optionally, the requested columns. Values are
        passed to ``bytes(..., encoding='utf-8')`` and so must be ``str``.
    :param col_names: names of the columns to push; columns missing from a
        row are skipped for that row
    :param table_name: name of the target table (``str`` or ``bytes``)
    :param ip: host of the target Thrift server
    :param server_port: port of the target Thrift server
    :return: a list that is never filled by this function, so it is always
        empty
    """
    print("start putDataAsPartition")
    if not isinstance(table_name, bytes):
        table_name = bytes(table_name, encoding='utf-8')
    col_names = HBaseUtils().str_list_to_bytes_list(col_names)
    # Decode every column name once rather than once per row.
    columns = [(str(col, encoding='utf-8'), col) for col in col_names]

    # Set up the HBase Thrift connection.
    transport = TTransport.TBufferedTransport(TSocket.TSocket(ip, server_port))
    protocol = TBinaryProtocol.TBinaryProtocol(transport)
    client = Hbase.Client(protocol)
    transport.open()

    return_data = []
    try:
        result = []
        for line in data:
            # Build the BatchMutation for this row.
            mutations_ = [
                Mutation(column=col,
                         value=bytes(line[key], encoding='utf-8'))
                for key, col in columns
                if key in line
            ]
            result.append(Hbase.BatchMutation(
                row=bytes(line["rowKey"], encoding='utf-8'),
                mutations=mutations_))

            # Flush every 1000 rows.
            if len(result) >= 1000:
                client.mutateRows(table_name, result, None)
                result = []

        # Flush whatever is left in the buffer.
        if result:
            client.mutateRows(table_name, result, None)
    finally:
        # Always release the connection, even when a row is malformed or a
        # write fails.
        transport.close()
    return return_data
def puts(self, rowKeys, values, qualifier='1'):
    """Write several rows to ``self.table`` with one ``mutateRows`` call.

    The j-th item of each row is stored in column
    ``self.columnFamilies[j] + ':' + qualifier``; the qualifier is
    incremented by one after every row.

    :param rowKeys: a list holding one row key per row, or a single row key
        that is reused for every row
    :param values: 2-dimensional list, one inner list per row; every item
        must be ``str`` (stored as-is) or ``int`` (stored via ``encode()``)
    :param qualifier: starting column qualifier, a string holding an integer
    :raises TypeError: if an item is neither ``str`` nor ``int``
    """
    if not isinstance(rowKeys, list):
        rowKeys = [rowKeys] * len(values)

    mutationsBatch = []
    for row_key_index, row_values in enumerate(values):
        mutations = []
        for family_index, item in enumerate(row_values):
            if isinstance(item, str):
                cell = item
            elif isinstance(item, int):
                cell = encode(item)
            else:
                # Previously an unsupported type either hit an unbound local
                # or silently duplicated the previous column's mutation.
                raise TypeError(
                    "unsupported value type for HBase cell: %s"
                    % type(item).__name__)
            column_name = self.columnFamilies[family_index] + ':' + qualifier
            mutations.append(Hbase.Mutation(column=column_name, value=cell))

        # One qualifier per row, so rows sharing a key do not collide.
        qualifier = str(int(qualifier) + 1)
        mutationsBatch.append(Hbase.BatchMutation(
            row=rowKeys[row_key_index], mutations=mutations))

    self.client.mutateRows(self.table, mutationsBatch, {})
def execute():
    """Write one sample row to HBase over a SASL-secured Thrift connection.

    Connects to ``thriftServer:thriftPort`` using ``saslServiceName``,
    writes row ``'00001'`` with columns ``c:coluna1`` and ``c:coluna2`` into
    ``tablename`` and prints ``OK`` on success. Those four names are read
    from the enclosing scope.
    """
    sock = TSocket.TSocket(thriftServer, thriftPort)
    transport = TTransport.TSaslClientTransport(sock, thriftServer,
                                                saslServiceName)
    protocol = TBinaryProtocol.TBinaryProtocol(transport)
    client = Hbase.Client(protocol)
    transport.open()

    try:
        mutations = [
            Hbase.Mutation(column="c:coluna1", value='Texto da coluna 1'),
            Hbase.Mutation(column="c:coluna2", value='Texto da coluna 2'),
        ]
        mutationsbatch = [Hbase.BatchMutation(row='00001', mutations=mutations)]
        client.mutateRows(tablename, mutationsbatch, {})
        print('OK')
    finally:
        # Close the connection even if the write fails. The locals are
        # released when the function returns, so no explicit ``del`` is
        # needed.
        transport.close()
# Creates table `tableName` with the column families 'info', 'contact' and
# 'others', then builds `mutationsbatch`: one BatchMutation per sample row
# (row keys '1', '2', ...), each carrying FULLNAME/AGE/EMAILID/PHONE/
# MODIFIEDDATE cells.
# NOTE(review): this snippet is cut off in the middle of the third row's
# mutation list; the remainder (and the call that sends the batch) is not
# visible here.
columnFamilies = [] columnFamilies.append(Hbase.ColumnDescriptor(name='info')) columnFamilies.append(Hbase.ColumnDescriptor(name='contact')) columnFamilies.append(Hbase.ColumnDescriptor(name='others')) client.createTable(tableName, columnFamilies) mutationsbatch = [] mutations = [ Hbase.Mutation(column='info:FULLNAME', value='Gustavo Achong'), Hbase.Mutation(column='info:AGE', value='38'), Hbase.Mutation(column='contact:EMAILID', value='*****@*****.**'), Hbase.Mutation(column='contact:PHONE', value='398-555-0132'), Hbase.Mutation(column='others:MODIFIEDDATE', value='5/16/2005 4:33:33 PM') ] mutationsbatch.append(Hbase.BatchMutation(row='1', mutations=mutations)) mutations = [ Hbase.Mutation(column='info:FULLNAME', value='Catherine Abel'), Hbase.Mutation(column='info:AGE', value='36'), Hbase.Mutation(column='contact:EMAILID', value='*****@*****.**'), Hbase.Mutation(column='contact:PHONE', value='747-555-0171'), Hbase.Mutation(column='others:MODIFIEDDATE', value='5/16/2005 4:33:33 PM') ] mutationsbatch.append(Hbase.BatchMutation(row='2', mutations=mutations)) mutations = [ Hbase.Mutation(column='info:FULLNAME', value='Kim Abercrombie'), Hbase.Mutation(column='info:AGE', value='38'), Hbase.Mutation(column='contact:EMAILID',
def puts(self, records, job_id):
    """Batch-insert records into HBase and write a progress checkpoint.

    Every record becomes one row whose key is ``md5(id)[0:10] + ':' + id``.
    Every other field is stored as ``data:<field>`` with the value
    ``str(field_value)`` UTF-8 encoded. After the write, ``self.put_num`` is
    increased and a checkpoint describing the last record is written to
    ``job_id + '.txt'`` while holding ``self.file_lock``.

    :param records: list of records; one record looks like
        ``{'_id': '', 'field1': '', 'field2': ''}``
    :param job_id: job type such as ``'mongodb:hb_charts'``; the part before
        the first ``':'`` selects the id field and the checkpoint field
        (``mongodb``: ``_id`` / ``last_updated``; ``mysql``: ``id`` /
        ``update_at``)
    :return: None
    """
    assert isinstance(records, list)
    if not records:
        # Nothing to write and no last record to checkpoint; previously
        # this path failed on ``records[-1]``.
        return

    source = job_id.split(':')[0]
    row_name = ''    # field that holds the record id
    log_column = ''  # field recorded in the checkpoint, e.g. update_at
    if source == 'mongodb':
        row_name = '_id'
        log_column = 'last_updated'
    elif source == 'mysql':
        row_name = 'id'
        log_column = 'update_at'
    # NOTE(review): for any other prefix row_name stays '' and the lookup
    # record[''] below will normally raise KeyError; confirm whether other
    # job types are expected.

    # hbase.client.keyvalue.maxsize defaults to 10M; larger values are
    # replaced by the string 'None'.
    max_value_size = 10 * 1024 * 1024

    mutations_batch = []
    for record in records:
        # Row key: md5(_id)[0:10]:_id
        _id = str(record[row_name])
        digest = hashlib.md5(bytes(_id, encoding="utf-8")).hexdigest()
        row_key = bytes(digest[0:10] + ':' + _id, encoding="utf-8")

        mutations = []
        for item in record:
            if item == row_name:
                continue
            key = bytes('data:' + item, encoding="utf8")
            var = bytes(str(record[item]), encoding="utf8")
            if len(var) >= max_value_size:
                var = bytes(str(None), encoding="utf8")
            mutations.append(Hbase.Mutation(column=key, value=var))
        mutations_batch.append(
            Hbase.BatchMutation(row=row_key, mutations=mutations))

    self.client.mutateRows(self.table, mutations_batch, {})
    self.put_num += len(mutations_batch)

    with self.file_lock:
        last_record = records[-1]
        # Build the payload before opening the file so that a failure here
        # does not leave a truncated checkpoint behind.
        checkpoint = {
            'date': time.strftime('%Y-%m-%d %H:%M:%S'),
            'job_id': job_id,
            'id': last_record[row_name],
            'update': '',
            'number': '',
        }
        if source == 'mongodb':
            checkpoint['update'] = last_record[log_column]
        elif source == 'mysql':
            checkpoint['update'] = last_record[log_column].strftime(
                '%Y-%m-%d %H:%M:%S')
        checkpoint['number'] = str(self.put_num)

        # The checkpoint is stored as the dict's ``str()`` form, unchanged
        # from the previous on-disk format.
        with open(job_id + '.txt', 'w') as f:
            f.write(str(checkpoint))
# Load every file in sourceDir into HBase: one row per line, one mutateRows
# call per file, then close the Thrift transport.
for filename in os.listdir(sourceDir):
    linenumber = 0

    # Create a list of mutations per work of Shakespeare
    mutationsbatch = []

    # The context manager closes each source file; previously the handles
    # were never closed.
    with open(os.path.join(sourceDir, filename), "rb") as shakespeare:
        for line in shakespeare:
            # Row key: <username>-<filename>-<zero-padded line number>
            rowkey = username + "-" + filename + "-" + str(linenumber).zfill(6)

            # Create an array containing all values for the Column Descriptors
            mutations = [
                Hbase.Mutation(column=messagecolumncf, value=line.strip()),
                Hbase.Mutation(column=linenumbercolumncf,
                               value=encode(linenumber)),
                Hbase.Mutation(column=usernamecolumncf, value=username),
            ]

            # Add the new mutations to the MutationsBatch list
            mutationsbatch.append(
                Hbase.BatchMutation(row=rowkey, mutations=mutations))
            linenumber += 1

    # Run the mutations for the work of Shakespeare
    client.mutateRows(tablename, mutationsbatch)

transport.close()