def getBatchMutations(cf, qualifiers, tups):
    """Build one ``BatchMutation`` per tuple in ``tups``.

    Position 0 of every tuple is used as the row key (``qualifiers[0]`` is
    never read).  Each later position ``i`` becomes a ``Mutation`` on column
    ``"<cf>:<qualifiers[i]>"`` carrying ``tup[i]`` as its value.

    Args:
        cf: column family name placed before the ``:`` in every column.
        qualifiers: sequence of qualifier names, index-aligned with each tuple.
        tups: iterable of indexable records.

    Returns:
        A list of ``BatchMutation`` objects, in the order of ``tups``.
    """
    result = []
    for record in tups:
        # Cells are built before the row key is read, one per qualifier
        # position after the first.
        cells = [
            Mutation(column="%s:%s" % (cf, qualifiers[pos]), value=record[pos])
            for pos in range(1, len(qualifiers))
        ]
        result.append(BatchMutation(record[0], cells))
    return result
def writeAll(self, infos):
    """Write several lines to the table with one ``mutateRowsTs`` call.

    Every line has the form ``<key section>|<value section>``, where both
    sections are back-tick separated ``name=value`` pairs.  The key section
    must provide ``jkid`` and ``time``; the row key is
    ``"<jkid>_<first 12 characters of time>"``.  Every pair of the value
    section is stored as a cell in column ``info:<name>``.

    NOTE: all mutations share a single timestamp, derived from the largest
    ``time`` value found among the lines.  Nothing is sent when ``infos`` is
    empty.

    Args:
        infos: iterable of line strings; must not be None.

    Returns:
        None.

    Raises:
        ValueError: if ``infos`` is None, or a line / pair lacks its
            separator (``|`` or ``=``).
        KeyError: if a key section lacks ``jkid`` or ``time``.
    """
    if infos is None:
        raise ValueError("info is None")

    row_batches = []
    latest_time = None
    for info in infos:
        # Split only on the first '|' so a value section that itself contains
        # '|' no longer breaks the two-way unpacking.
        key, value = info.split('|', 1)

        key_fields = {}
        for kv in key.split('`'):
            # Same idea for '=': everything after the first one is the value.
            name, field = kv.split('=', 1)
            key_fields[name] = field
        jkid = key_fields['jkid']
        jktime = key_fields['time']
        row_key = "%s_%s" % (jkid, jktime[0:12])

        cells = []
        for kv in value.split('`'):
            name, field = kv.split('=', 1)
            cells.append(Mutation(column="info:%s" % name, value=field))

        batch = BatchMutation()
        batch.row = row_key
        batch.mutations = cells
        row_batches.append(batch)

        # Plain string comparison; the explicit None test replaces cmp(),
        # which relied on Python 2 ordering None before every string.
        if latest_time is None or jktime > latest_time:
            latest_time = jktime

    if latest_time is not None:
        # mktime() interprets the parsed time as local time and returns
        # float seconds; the value is truncated to an integer before it is
        # handed to the client.
        # NOTE(review): the value is in seconds - confirm the unit the
        # server side expects for cell timestamps.
        timestamp = int(time.mktime(time.strptime(latest_time, "%Y%m%d%H%M%S")))
        self.client.mutateRowsTs(self.tableName, row_batches, timestamp)
def insert_rows_data(self, table_name, data):
    """Insert several rows into ``table_name`` with one ``mutateRows`` call.

    Args:
        table_name: name of the target table.
        data: mapping ``{row_key: {column: value}}``.  Rows whose column
            mapping is empty or otherwise falsy are ignored; every value is
            serialised with ``json.dumps`` before it is stored.

    Returns:
        True when the rows were sent successfully.  False when there was
        nothing to insert, or when the call failed and ``self.exception`` is
        falsy (the error is logged in that case).

    Raises:
        Exception: whatever ``mutateRows`` raised, re-raised unchanged when
            ``self.exception`` is truthy.
    """
    batches = []
    for row, columns in data.items():
        if not columns:
            continue
        cells = [
            Mutation(column=column, value=json.dumps(value))
            for column, value in columns.items()
        ]
        batches.append(BatchMutation(row, cells))

    if not batches:
        logging.info('insert data null')
        return False

    try:
        self.client.mutateRows(table_name, batches)
    except Exception as e:
        if self.exception:
            # Bare raise keeps the original traceback for the caller.
            raise
        logging.error('insert_rows_data data error : %s', e)
        return False
    return True
def input_to_es(table, ops):
    """Fill ``table`` with randomly generated statistics rows.

    One row is written per (date, provider) combination, where the dates
    come from ``get_dates(ops.start, months)`` and the providers from
    ``PROVIDERIDs``.  The row key is the date without ``/`` followed by the
    provider number.  All rows are sent with a single ``mutateRows`` call.

    Args:
        table: name of the target table.
        ops: options object; ``location``, ``port``, ``start`` and ``months``
            are read from it.

    Returns:
        None.
    """
    client, transport = get_client(table, ops.location, ops.port)
    transport.open()
    try:
        months = int(ops.months)
        rows = []
        print('Begin to write date to table, from %s to %d months later'
              % (ops.start, months))
        for date in get_dates(ops.start, months):
            for provider, no in PROVIDERIDs:
                rowkey = date.replace('/', '') + no
                # The random draws stay in the original order (total, then
                # the two error counts) so seeded runs produce the same data.
                et = random.randint(100000, 99999999)
                exp_error = random.randint(100, 999)
                expstate_error = random.randint(100, 999)
                rows.append(BatchMutation(rowkey, [
                    Mutation(column='info:logisticproviderid', value=provider),
                    Mutation(column='exp:total', value=str(et)),
                    Mutation(column='exp:error', value=str(exp_error)),
                    Mutation(column='expstate:total', value=str(et * 10)),
                    Mutation(column='expstate:error', value=str(expstate_error)),
                ]))
        client.mutateRows(table, rows, {})
    finally:
        # Close the transport even when building or sending the rows fails.
        transport.close()
    print('Write to ExpressStatistics finished')
column01 = ColumnDescriptor( name='user_info' ) # ColumnDescriptor(bloomFilterType='NONE', bloomFilterNbHashes=0, name='user_info', maxVersions=3, blockCacheEnabled=False, inMemory=False, timeToLive=-1, bloomFilterVectorSize=0, compression='NONE') column02 = ColumnDescriptor('addr_info') #hbase好像不支持使用Python创建预分区表 # client.createTable('tablepy',[column01,column02]) # print(client) region_info = client.getTableRegions('tablepy') #查看表分区 table_info = client.getColumnDescriptors('tablepy') #查看表结构 print( region_info ) # [TRegionInfo(startKey='', endKey='', version=1, id=1543752131747L, name='tablepy,,1543752131747.ccfa71e67b9732adb575129bf9e560eb.')] print( table_info ) # {'addr_info:': ColumnDescriptor(bloomFilterType='NONE', bloomFilterNbHashes=0, name='addr_info:', maxVersions=3, blockCacheEnabled=False, inMemory=False, timeToLive=2147483647, bloomFilterVectorSize=0, compression='NONE'), 'user_info:': ColumnDescriptor(bloomFilterType='NONE', bloomFilterNbHashes=0, name='user_info:', maxVersions=3, blockCacheEnabled=False, inMemory=False, timeToLive=2147483647, bloomFilterVectorSize=0, compression='NONE')} #插入数据 mutation = Mutation(column='user_info:province', value='350000') batchs = BatchMutation('row02', [mutation]) insert_resut = client.getRow('tablepy', 'row01') #插入多条数据 # client.mutateRow('tablepy','row01',[mutation]) client.mutateRows( 'tablepy', [batchs]) #这个方法与上面mutateRow的区别在于,mutateRows可以一次插入多条记录,而mutateRow只能插入单条数据 print( insert_resut ) # [TRowResult(columns={'user_info:province': TCell(timestamp=1543752270954L, value='350000')}, row='row01')] client.deleteAll('tablepy', 'row01', 'addr_info') #删除指定指定行指定列的数据 client.deleteAllRow('tablepy', 'row01') #删除指定行的全部数据 socket.close() #用完要记得关闭
def data_shuffle(self, mongo_data_list, province_list, city_list, area_list):
    """Convert school records into a list of ``BatchMutation`` rows.

    For every record the province, city and district are resolved by
    matching region names against ``data["AREA_"]``; ``data["ADDR_"]`` is
    then normalised so that it starts with the resolved province name, and
    one ``BatchMutation`` holding all output columns is built.

    Side effects:
        * entries named "县" are removed from ``city_list`` in place;
        * ``data["AREA_"]`` and ``data["ADDR_"]`` of processed records are
          rewritten in place;
        * records whose province, city or district cannot be resolved are
          printed and skipped.

    Args:
        mongo_data_list: iterable of record dicts.  The keys read are
            ``AREA_``, ``ADDR_``, ``DEALTIME_``, ``DATETIME_``,
            ``ENTITY_CODE_``, ``ENTITY_NAME_``, ``NAME_``, ``IMAGES_``,
            ``GRADE_``, ``SCHOOL_TYPE_``, ``PERIOD_``, ``TEL_``,
            ``WEBSITE_``, ``BRIEF_``, ``URL_`` and ``_id``.
        province_list: dicts with ``NAME_`` and ``CODE_``.
        city_list: list of dicts with ``NAME_``, ``CODE_`` and ``PARENT_``.
        area_list: dicts with ``NAME_``, ``CODE_`` and ``PARENT_``.

    Returns:
        A list with one ``BatchMutation`` per record that was not skipped.
    """
    batch_list = []

    # Drop the placeholder city entries named "县".  Filtering in one pass
    # (rather than calling remove() while iterating the same list) makes sure
    # consecutive entries are all removed; slice assignment keeps updating
    # the caller's list in place, as before.
    city_list[:] = [city for city in city_list if city["NAME_"] != "县"]

    for data in mongo_data_list:
        prov_n = prov_c = None
        city_n = city_c = None
        area_n = area_c = None

        # Province: AREA_ must start with the province name (last match wins).
        for prov in province_list:
            if prov["NAME_"] == data["AREA_"][:len(prov["NAME_"])]:
                prov_n = prov["NAME_"]
                prov_c = prov["CODE_"]
        if prov_n is None:
            # Without a province nothing below can be resolved (and the
            # lookups would fail on None), so treat the record like any
            # other unresolvable one: report it and move on.
            print(data)
            continue

        # City: a child of the province whose name occurs right after the
        # province name in AREA_.
        for city in city_list:
            if city["PARENT_"] == prov_c:
                if city["NAME_"] in data["AREA_"][:len(prov_n) + len(city["NAME_"])]:
                    city_n = city["NAME_"]
                    city_c = city["CODE_"]

        # District: a child of the city, matched by its full name or, failing
        # that, by its first two characters (AREA_ is then rewritten so that
        # it ends with the full district name).
        for area in area_list:
            if area["PARENT_"] == city_c:
                if area["NAME_"] in data["AREA_"]:
                    area_n = area["NAME_"]
                    area_c = area["CODE_"]
                elif area["NAME_"][:2] in data["AREA_"]:
                    area_n = area["NAME_"]
                    area_c = area["CODE_"]
                    index = data["AREA_"].find(area["NAME_"][:2])
                    data["AREA_"] = data["AREA_"].replace(
                        data["AREA_"][index:], area["NAME_"])

        if area_n is None:
            # Qiaodong district of Shijiazhuang was merged and has no match
            # in the region data; it is mapped to Qiaoxi district.
            if data["AREA_"] == "河北省石家庄市桥东区":
                area_n = "桥西区"
                area_c = "130104"
            else:
                # Areas whose parent code ends in "9000": the city becomes
                # the fixed placeholder name with code
                # <first two digits of the province code> + "9000".
                # TODO (kept from the original): also require
                # area["PARENT_"][:2] == city_c[:2].
                for area in area_list:
                    if area["PARENT_"][-4:] == "9000":
                        if area["NAME_"] in data["AREA_"]:
                            city_n = "省直辖县级行政区划"
                            city_c = prov_c[:2] + "9000"
                            area_n = area["NAME_"]
                            area_c = area["CODE_"]

        if area_n is None:
            # Fall back to the city for both name and code (the code was
            # previously left unset by a self-assignment).
            area_n = city_n
            area_c = city_c

        # Unresolvable record: report it and skip.  The city check covers the
        # hard-coded district above being reached without a matched city.
        if area_n is None or city_n is None:
            print(data)
            continue

        # ---- address clean-up -------------------------------------------
        # Address starts with "中国": remove it (replace() drops every
        # occurrence, not only the leading one).
        if "中国" in data["ADDR_"][:2]:
            data["ADDR_"] = data["ADDR_"].replace("中国", "")
        if data["ADDR_"] == "":
            data["ADDR_"] = data["AREA_"]
        # Cut everything from "电话" onwards.
        if "电话" in data["ADDR_"]:
            index = data["ADDR_"].find("电话")
            data["ADDR_"] = data["ADDR_"].replace(data["ADDR_"][index:], "")
        # Collapse "<province><province without its last character>".
        if prov_n + prov_n[:-1] in data["ADDR_"]:
            data["ADDR_"] = data["ADDR_"].replace(prov_n + prov_n[:-1], prov_n)
        # Make the address start with the full province name.
        if prov_n not in data["ADDR_"][:len(prov_n)]:
            if prov_n[:2] in data["ADDR_"][:len(prov_n)]:
                data["ADDR_"] = data["ADDR_"][:len(prov_n)].replace(
                    prov_n[:2], prov_n) + data["ADDR_"][len(prov_n):]
            else:
                data["ADDR_"] = prov_n + data["ADDR_"]
        # Make the city name follow the province name.
        if city_n not in data["ADDR_"][:len(prov_n) + len(city_n)]:
            if city_n[:-1] in data["ADDR_"][:len(prov_n) + len(city_n)]:
                # City present without its last character: complete it.
                data["ADDR_"] = data["ADDR_"][:len(prov_n) + len(city_n)].replace(
                    city_n[:-1], city_n) + data["ADDR_"][len(prov_n) + len(city_n):]
                data["ADDR_"] = data["ADDR_"].replace("市县", "市")
            else:
                if city_c[-4:] != "9000":
                    # Ordinary city: insert it right after the province.
                    data["ADDR_"] = (data["ADDR_"][:len(prov_n)] + city_n
                                     + data["ADDR_"][len(prov_n):])
                else:
                    # "9000" placeholder city: the district name is expected
                    # to follow the province directly.
                    if area_n in data["ADDR_"][:len(prov_n) + len(area_n)]:
                        pass
                    else:
                        if area_n in data["ADDR_"]:
                            index = data["ADDR_"].find(area_n)
                            data["ADDR_"] = prov_n + data["ADDR_"][index:]
                        else:
                            data["ADDR_"] = data["ADDR_"][:len(prov_n)].replace(
                                prov_n, "") + data["ADDR_"][len(prov_n):]
                            data["ADDR_"] = data["ADDR_"][:len(city_n)].replace(
                                city_n, "") + data["ADDR_"][len(city_n):]
                            index = data["ADDR_"].find(area_n) - len(area_n)
                            data["ADDR_"] = (prov_n + data["AREA_"][index:]
                                             + data["ADDR_"])
                    # NOTE(review): this removal is kept inside the "9000"
                    # branch; confirm that this is the intended nesting.
                    if city_n in data["ADDR_"][:len(prov_n) + len(city_n)]:
                        data["ADDR_"] = data["ADDR_"][:len(prov_n) + len(city_n)].replace(
                            city_n, "") + data["ADDR_"][len(prov_n) + len(city_n):]
        addr_ = data["ADDR_"]

        # ---- HBase row ----------------------------------------------------
        # Row key: <ENTITY_CODE_>_<9999999999 - DEALTIME_>, so a larger
        # DEALTIME_ yields a smaller suffix.
        deal_time = int(float(data["DEALTIME_"]))
        row_time = 9999999999 - deal_time
        row = str(data["ENTITY_CODE_"]) + "_" + str(row_time)

        # (column family, qualifier, value), in the order they are written.
        columns = [
            ("S", "STATUS_", "1"),
            ("C", "CREATE_TIME_", str(data["DATETIME_"])),
            ("C", "AREA_CODE_", area_c),
            ("F", "NAME_", str(data["NAME_"])),
            ("F", "IMAGES_", str(data["IMAGES_"])),
            ("F", "GRADE_", str(data["GRADE_"])),
            ("F", "PROVINCE_CODE_", prov_c),
            ("F", "PROVINCE_NAME_", prov_n),
            ("F", "CITY_CODE_", city_c),
            ("F", "CITY_NAME_", city_n),
            ("F", "DISTRICT_CODE_", area_c),
            ("F", "DISTRICT_NAME_", area_n),
            ("F", "SCHOOL_TYPE_", str(data["SCHOOL_TYPE_"])),
            ("F", "PERIOD_", str(data["PERIOD_"])),
            ("F", "TEL_", str(data["TEL_"])),
            ("F", "WEBSITE_", str(data["WEBSITE_"])),
            ("F", "ADDR_", addr_),
            ("F", "BRIEF_", str(data["BRIEF_"])),
            ("F", "URL_", str(data["URL_"])),
            ("F", "DEALTIME_", str(data["DEALTIME_"])),
            ("F", "ENTITY_NAME_", str(data["ENTITY_NAME_"])),
            ("F", "ENTITY_CODE_", str(data["ENTITY_CODE_"])),
            ("F", "_id", str(data["_id"])),
        ]
        mutation_list = [
            Mutation(column="{}:{}".format(family, qualifier), value=value)
            for family, qualifier, value in columns
        ]
        batch_list.append(BatchMutation(row, mutation_list))

    return batch_list