def deal_quan_shop(city, date, all_shopid_list, cate_list):
    '''
    Process the basic data of the shops in a shop circle.
    :param city: city the shop circle belongs to
    :param date: date the data was analysed
    :param all_shopid_list: list of shop ids in the shop circle
    :param cate_list: category list of the shop circle
    :return:
    '''
    sql = """
        select t1.rest_id, t1.rest_name, t1.delivery_id, t1.longitude, t1.latitude, t1.is_new
        from t_e_rest_list_city_1712 t1
        where t1.city = '%s' and t1.date = '%s' and t1.rest_id IN %s
    """ % (city, date, all_shopid_list)
    sql2 = "insert into quan_shop values"
    batch = BatchSql(sql2)
    result = db2.queryForList(sql, None)
    print(result)
    for item in result:
        for cate in cate_list:
            if item[0] == cate[0]:  # match the shop id with its category entry
                quan_shop = []
                quan_shop.extend([0, item[0], 1, date, item[1], item[2], item[3],
                                  item[4], item[5], cate[1], time, time])
                print(quan_shop)
                batch.addBatch(quan_shop)
                if batch.getSize() > 1000:
                    db1.update(batch)
                    batch.cleanBatch()
    db1.update(batch)  # flush the remaining rows
def deal_quan_monitor_activity_data(city, date, all_shopid_list):
    '''
    Process the promotion information of the shops in a shop circle.
    :param city: city the data belongs to
    :param date: date of the data
    :param all_shopid_list: list of shop ids in the shop circle
    :return:
    '''
    sql = """
        select t1.rest_id, t1.description, t1.active_type
        from t_e_rest_active_city_1712 t1
        where t1.city = '%s' and t1.date = '%s' and t1.rest_id IN %s
    """ % (city, date, all_shopid_list)
    sql2 = "insert into quan_monitor_activity_data values"
    batch = BatchSql(sql2)
    result = db2.queryForList(sql, None)
    print(result)
    for shoplist in all_shopid_list:
        all_activity = []
        quan_monitor_activity_data = []
        for item in result:
            activity = dict(type=item[2], content=item[1])
            if item[0] == shoplist:
                all_activity.append(activity)
        all_activity = json.dumps(all_activity)  # serialise one shop's promotions as a JSON string
        quan_monitor_activity_data.extend([0, date, shoplist, str(all_activity), time, time])
        print(quan_monitor_activity_data)
        batch.addBatch(quan_monitor_activity_data)
        if batch.getSize() > 10000:
            db1.update(batch)
            batch.cleanBatch()
    db1.update(batch)  # flush the remaining rows
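# Illustration only (not part of the pipeline): the payload written by
# deal_quan_monitor_activity_data is a JSON array of {"type": ..., "content": ...}
# objects, one per promotion of the shop. The sample rows below are made up.
import json

sample_rows = [
    ('shop_1', 'Spend 30 get 5 off', 'money_off'),    # hypothetical query result rows
    ('shop_1', 'Free delivery over 20', 'delivery'),
]
all_activity = [dict(type=row[2], content=row[1]) for row in sample_rows if row[0] == 'shop_1']
print(json.dumps(all_activity, ensure_ascii=False))
# [{"type": "money_off", "content": "Spend 30 get 5 off"}, {"type": "delivery", "content": "Free delivery over 20"}]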
def __init__(self, resource_path, city, date, save_date_short, err_rest_ids, db):
    '''
    :param resource_path: path of the city data files
    :param city: city name in pinyin
    :param date: save date
    :param save_date_short: short form of the save date
    :param err_rest_ids: ids of invalid shops
    :param db: database configuration
    '''
    self.resource_path = resource_path
    self.city = city
    self.date = date
    self.save_date_short = save_date_short
    self.err_rest_ids = err_rest_ids
    self.db = db
    self.sql = "insert into t_e_rest_score_city_%s values" % save_date_short
    self.batch = BatchSql(self.sql)
def __init__(self, resource_path, city, date, rest_area, err_rest_ids, db, is_test=False):
    ParseDataObject.__init__(self, resource_path, is_test)
    self.err_rest_ids = err_rest_ids
    self.city = city
    self.date = date
    self.rest_area = rest_area
    self.db = db
    sql = "insert into t_e_rest_score_import values"
    self.batch1 = BatchSql(sql)
class ParseDeliveymode(object):
    def __init__(self, resource_path, city, date, save_date_short, db):
        '''
        :param resource_path: path of the city data files
        :param city: city name in pinyin
        :param date: save date
        :param save_date_short: short form of the save date
        :param db: database configuration
        '''
        self.resource_path = resource_path
        self.city = city
        self.date = date
        self.save_date_short = save_date_short
        self.db = db
        self.sql = "insert into t_e_delivery_mode_city_pre_%s values" % save_date_short
        self.batch = BatchSql(self.sql)

    def parse_data(self, data):
        try:
            rest_id = str(data['param'])
            if len(rest_id) <= 11:
                data = data['data'].get('delivery_mode')
                delever_id = getMapValue(data, "id")
                delever_text = getMapValue(data, "text")
                param = [self.city, self.date, rest_id, delever_text, delever_id]
                self.batch.addBatch(param)
                if self.batch.getSize() > 100000:
                    self.db.update(self.batch)
                    self.batch.cleanBatch()
            else:
                print("bad record", data)
        except Exception as e:
            print(traceback.print_exc())
            print('failed to parse record\nbad record: {}'.format(data))

    def run(self):
        f_name = os.path.join(self.resource_path, 'rest_info.pickle')
        print("parsing file:", f_name)
        with open(f_name, "rb+") as f:
            while True:
                try:
                    data = pickle.load(f)
                    self.parse_data(data)
                except EOFError as e:
                    print("finished reading file", e)
                    break
                except Exception as e2:
                    print("error", e2)
                    break
        if self.batch.getSize() > 0:
            self.db.update(self.batch)
            self.batch.cleanBatch()
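# Illustration only: the run() methods in this module all read a file that contains
# many pickled records appended one after another, looping on pickle.load() until
# EOFError. A generator form of that pattern (a sketch, not part of the project)
# keeps the read loop in one place:
import pickle

def iter_pickle_records(path):
    """Yield records from a file of concatenated pickle dumps until EOF."""
    with open(path, "rb") as f:
        while True:
            try:
                yield pickle.load(f)
            except EOFError:
                return  # end of file: nothing left to load

# Usage sketch (the file name below is hypothetical):
# for record in iter_pickle_records('/data/elm/nanjing/rest_info.pickle'):
#     parser.parse_data(record)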
def deal_quan_monitor_hot_sku_data(city, date, all_shopid_list):
    '''
    Process the shop-circle SKU data; 9 shops produce roughly 1.1 million rows.
    :param city: city the data belongs to
    :param date: date of the data
    :param all_shopid_list: list of shop ids in the shop circle
    :return:
    '''
    sql = """
        select t1.rest_id, t1.food_name, t1.price, t1.food_id, t1.food_month_sales, t1.has_activity, t2.delivery_fee
        from t_e_rest_menu_level2_unique_city_nj_1712 t1, t_e_rest_list_city_1712 t2
        where t1.city = '%s' and t1.date = '%s' and t1.rest_id IN %s and t1.food_month_sales != 0
          and t1.city = t2.city and t1.date = t2.date and t1.rest_id = t2.rest_id
    """ % (city, date, all_shopid_list)
    sql2 = 'insert into quan_monitor_hot_sku_data values'
    batch = BatchSql(sql2)
    result = db2.queryForList(sql, None)
    print(result)
    for shop_list in all_shopid_list:
        for item in result:
            quan_monitor_hot_sku_data = []
            if shop_list == item[0]:
                if item[5] == 1:
                    # special-price dish: the listed price is already the final price
                    quan_monitor_hot_sku_data.extend([0, date, shop_list, item[1], item[2], item[2], item[4], time, time])
                    print(quan_monitor_hot_sku_data)
                    batch.addBatch(quan_monitor_hot_sku_data)
                elif item[5] == 0:
                    # dish covered by a money-off promotion: subtract the discount
                    sub_price = get_sub_price(city, date, shop_list, item[2])
                    if sub_price is None:
                        sub_price = 0
                    true_price = item[2] - sub_price
                    quan_monitor_hot_sku_data.extend([0, date, shop_list, item[1], item[2], true_price, item[4], time, time])
                    print(quan_monitor_hot_sku_data)
                    batch.addBatch(quan_monitor_hot_sku_data)
                if batch.getSize() > 10000:
                    db1.update(batch)
                    batch.cleanBatch()
    db1.update(batch)  # flush the remaining rows
def deal_TrafficStats(self):
    '''
    Process the traffic-analysis data: shop id, date, exposures, visits, orders,
    visit conversion rate and order conversion rate.
    :return:
    '''
    TrafficStats = []
    flow_info = self.TrafficStats['data']['flowGeneralInfoVo']
    exposureNum = flow_info['exposureNum']
    visitNum = flow_info['visitNum']
    orderNum = flow_info['orderNum']
    visitRate = flow_info['visitRate']
    orderRate = flow_info['orderRate']
    TrafficStats.extend([self.shopid, self.date, exposureNum, visitNum, orderNum, visitRate, orderRate])
    print('traffic analysis:', TrafficStats)
    sql = "insert into meituan_backstage VALUES "
    batch = BatchSql(sql)
    batch.addBatch(TrafficStats)
    self.db.update(batch)
def deal_meituan_customer_prefer_analysis(self):
    '''
    Process what customers of similar shops like: id, shop id, shop name, date,
    the dish name favoured by peers' customers and that dish's percentage among peers.
    :return:
    '''
    customer_prefer_analysi = self.customerPerfer['data']['pubCusEat']
    for item in customer_prefer_analysi:
        meituan_customer_prefer_analysi = []
        customer_prefer_name = item[0]
        customer_prefer_percent = item[1]
        meituan_customer_prefer_analysi.extend([
            0, self.shopid, self.shopname, str(self.yesterday),
            customer_prefer_name, float(customer_prefer_percent)
        ])
        print('peer customer preference:', meituan_customer_prefer_analysi)
        sql = "insert into meituan_customer_prefer_analysis VALUES"
        batch = BatchSql(sql)
        batch.addBatch(meituan_customer_prefer_analysi)
        self.db2.update(batch)
def deal_meituan_customer_precision_operation(self):
    '''
    Process the targeted-marketing data: id, shop id, shop name, date,
    campaign name and number of targeted customers.
    :return:
    '''
    customer_precision_labelList = self.getCouponLabel['data']['labelList']
    for item in customer_precision_labelList:
        customer_precision_operation = []
        precision_operation_name = item['title']
        precision_operation_numbers = item['nums']
        customer_precision_operation.extend([
            0, self.shopid, self.shopname, str(self.yesterday),
            precision_operation_name, precision_operation_numbers
        ])
        print('targeted marketing campaign:', customer_precision_operation)
        sql = "insert into meituan_customer_precision_operation VALUES"
        batch = BatchSql(sql)
        batch.addBatch(customer_precision_operation)
        self.db2.update(batch)
def deal_CustomerReminder(self):
    '''
    Process order-reminder data: id, shop id, date, time, platform,
    number of reminders and time taken to respond to a reminder.
    :return:
    '''
    CustomerReminder_Info = self.customer_reminderInfo['wmOrderList']
    CustomerReminder_Data = self.customer_reminder['data']
    if CustomerReminder_Info is not None:
        for item in CustomerReminder_Info:
            # the reminder id comes from the other reminder-account endpoint
            CustomerReminder_ID = item['wm_order_id_view']
            try:
                # look up the timing records that belong to this reminder id
                for reminder in CustomerReminder_Data[str(CustomerReminder_ID)]:
                    times = len(CustomerReminder_Data[str(CustomerReminder_ID)])  # number of reminders
                    CustomerReminder = []
                    reminder_time_fmt = reminder['reminder_time_fmt']
                    response_time_fmt = reminder['response_time_fmt']
                    # only the minute field is kept, so the difference is wrong across hour boundaries
                    reminder_time_fmt = str(reminder_time_fmt).split(':')[1]
                    response_time_fmt = str(response_time_fmt).split(':')[1]
                    reminder_time = int(response_time_fmt) - int(reminder_time_fmt)  # minutes the merchant backend took to reply
                    CustomerReminder.extend([
                        0, self.shopid, self.date, self.time,
                        CustomerReminder_ID, 2, times, reminder_time
                    ])
                    print('customer reminder:', CustomerReminder)
                    sql = "insert into daily_customer_reminder VALUES"
                    batch = BatchSql(sql)
                    batch.addBatch(CustomerReminder)
                    self.db.update(batch)
            except Exception:
                pass
def deal_Activitylist(self):
    '''
    Process shop activity data: id, shop id, activity id, date, platform,
    activity type, activity name and activity content.
    :return:
    '''
    Activity_onGoingActs = self.activitylist['data']['onGoingActs']
    for item in Activity_onGoingActs:
        Activitylist = []
        actId = item['actId']
        actName = item['actName']
        poiPolicy = item['poiPolicy']
        type = item['type']
        Activity_type = self.deal_Activity_type(type)
        Activitylist.extend([0, self.shopid, actId, self.date, 2, Activity_type, actName, poiPolicy])
        print('shop activity:', Activitylist)
        sql = "insert into daily_shop_activity_data VALUES"
        batch = BatchSql(sql)
        batch.addBatch(Activitylist)
        self.db3.update(batch)
def deal_quan_monitor_shop_data(city, date, all_shopid_list):
    '''
    Process the delivery fee and delivery time data of the shops in a shop circle.
    :param city: city the shop circle belongs to
    :param date: date the data was crawled
    :param all_shopid_list: list of all shops in the shop circle
    :return:
    '''
    sql = """
        select t1.rest_id, t1.order_month_sales, t2.deliver_time, t1.delivery_fee, t1.min_delivery_price, t2.overall_score
        from t_e_rest_list_city_1712 t1, t_e_rest_score_city_1712 t2
        where t1.city = '%s' and t1.date = '%s' and t1.rest_id IN %s
          and t1.rest_id = t2.rest_id and t1.city = t2.city and t1.date = t2.date
    """ % (city, date, all_shopid_list)
    sql2 = "insert into quan_monitor_shop_data values"
    batch = BatchSql(sql2)
    result = db2.queryForList(sql, None)
    for item in result:
        quan_monitor_shop_data = []
        score = float('%.1f' % float(item[5]))  # round the overall score to one decimal place
        quan_monitor_shop_data.extend([0, date, item[0], item[1], item[2], item[3], item[4], score, time, time])
        print(quan_monitor_shop_data)
        batch.addBatch(quan_monitor_shop_data)
        if batch.getSize() > 10000:
            db1.update(batch)
            batch.cleanBatch()
    db1.update(batch)  # flush the remaining rows
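# Illustration only: a driver for the shop-circle functions above might look like the
# hypothetical calls below. The city, date, shop-id tuple and category list are made-up
# values, and db1, db2 and time are assumed to be configured at module level as in the
# rest of this file (the shop-id tuple formats directly into the "rest_id IN %s" clause).
# city = 'nanjing'
# date = '2017-12-31'
# all_shopid_list = ('1234567', '2345678', '3456789')                 # hypothetical shop ids
# cate_list = [('1234567', 'Fast food'), ('2345678', 'Dessert')]      # (shop id, category name)
# deal_quan_shop(city, date, all_shopid_list, cate_list)
# deal_quan_monitor_activity_data(city, date, all_shopid_list)
# deal_quan_monitor_hot_sku_data(city, date, all_shopid_list)
# deal_quan_monitor_shop_data(city, date, all_shopid_list)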
class ParseHotword(object):
    def __init__(self, resource_path, city, date, save_date_short, err_rest_ids, db):
        '''
        :param resource_path: path of the city data files
        :param city: city name in pinyin
        :param date: save date
        :param save_date_short: short form of the save date
        :param err_rest_ids: ids of invalid shops
        :param db: database configuration
        '''
        self.resource_path = resource_path
        self.city = city
        self.date = date
        self.save_date_short = save_date_short
        self.err_rest_ids = err_rest_ids
        self.db = db
        self.sql = "insert into t_e_hot_search_word_city_%s values" % save_date_short
        self.batch = BatchSql(self.sql)

    def parse_data(self, data):
        try:
            rest_id = str(data["param"][0])
            if rest_id in self.err_rest_ids:
                return
            if len(rest_id) <= 11:
                la = data["param"][1]
                lo = data["param"][2]
                for item in data["data"]:
                    search_word = getMapValue(item, "search_word")
                    if search_word != '-999':
                        self.batch.addBatch([self.city, self.date, rest_id, la, lo, search_word])
                        if self.batch.getSize() > 100000:
                            self.db.update(self.batch)
                            self.batch.cleanBatch()
            else:
                print("bad record", data)
                print('bad record\n{}'.format(data))
        except Exception as e:
            print("failed to parse record", e)
            print('failed to parse record\nbad record: {}'.format(data))

    def run(self):
        f_name = os.path.join(self.resource_path, 'hot_word.pickle')
        print("parsing file:", f_name)
        with open(f_name, "rb+") as f:
            while True:
                try:
                    data = pickle.load(f)
                    self.parse_data(data)
                except EOFError as e:
                    print("finished reading file", e)
                    break
                except Exception as e2:
                    print("error", e2)
                    break
        if self.batch.getSize() > 0:
            self.db.update(self.batch)
            self.batch.cleanBatch()
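# Illustration only: a hypothetical way to run the hot-word parser. The path, dates
# and the db object are placeholders for whatever the surrounding driver actually
# builds (e.g. a DBUtils instance as used elsewhere in this project); err_rest_ids
# is the set of invalid shop ids returned by the category parser.
# parser = ParseHotword(resource_path='/data/elm/nanjing/20171231',   # hypothetical path
#                       city='nanjing', date='2017-12-31',
#                       save_date_short='1712', err_rest_ids=err_rest_ids, db=db)
# parser.run()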
class ParseCategoryObject(ParseDataObject):
    """ Parses category.pickle data. """

    def __init__(self, resource_path, city, date, rest_area, db, is_test=False):
        ParseDataObject.__init__(self, resource_path, is_test)
        self.err_rest_ids = set()
        self.rest_category = defaultdict(set)
        self.city = city
        self.date = date
        self.rest_area = rest_area
        self.db = db
        sql1 = "insert into t_e_rest_list_import values"
        sql2 = "insert into t_e_rest_active_import values"
        sql3 = "insert into t_e_rest_category_import values"
        sql4 = "insert into t_e_rest_open_time_import values"
        sql5 = "insert into t_e_rest_money_off_import values"
        sql6 = "insert into t_e_rest_money_off_avg_import values"
        self.batch1 = BatchSql(sql1)
        self.batch2 = BatchSql(sql2)
        self.batch3 = BatchSql(sql3)
        self.batch4 = BatchSql(sql4)
        self.batch5 = BatchSql(sql5)
        self.batch6 = BatchSql(sql6)

    def parse(self, generator=None):
        generator = generator if generator else self.GeneratorPickleData()
        try:
            for data in generator:
                self.parse_data(data)
            self.insertAll()
        except:
            print("failed to parse data:", traceback.print_exc())

    def insert(self, batch, param):
        batch.addBatch(param)
        if batch.getSize() > 100000:
            self.db.update(batch)
            batch.cleanBatch()

    def insertAll(self):
        # flush whatever is left in every batch
        for batch in (self.batch1, self.batch2, self.batch3, self.batch4, self.batch5, self.batch6):
            if batch.getSize() > 0:
                self.db.update(batch)
                batch.cleanBatch()

    def parse_data(self, data):
        try:
            category_item = data['param'][2]
            category_level2_id = str(category_item.get("id2"))
            item = data["data"]
            rest_id = getMapValue(item, "id")
            lat = float(getMapValue(item, "latitude"))
            lng = float(getMapValue(item, "longitude"))
            if len(rest_id) > 11:
                print('bad shop id\nrest_id: {}'.format(rest_id))
            # elif not (self.polygonHellp.is_location_in_polygon(lng, lat)):
            #     self.err_rest_ids.add(rest_id)
            #     print('bad address\nrest_id: {}, lat: {}, lng: {}, name: {}\n'
            #           'address: {}'.format(rest_id, lat, lng, getMapValue(item, "name"),
            #                                getMapValue(item, "address")))
            else:
                # de-duplicate shops: only the first record of a shop feeds the list tables
                if not self.rest_category[rest_id]:
                    param1 = [
                        self.city, self.date, self.rest_area, rest_id,
                        getMapValue(item, "name"),
                        getMapValue(item, "phone", '/'),
                        getMapValue(item, "address"),
                        getMapValue(item, "average_cost"),
                        [1, 0][getMapValue(item, "delivery_mode") == '-999'],
                        getMapValue(item, "float_delivery_fee"),
                        getMapValue(item, "float_minimum_order_amount"),
                        [0, 1][getMapValue(item, "is_new") == 'True'],
                        [0, 1][getMapValue(item, "is_premium") == 'True'],
                        lat, lng,
                        getMapValue(item, "recent_order_num"), '-999'
                    ]
                    self.insert(self.batch1, param1)

                    opening_hours = getMapValue(item, "opening_hours")
                    arr = json.loads(opening_hours.replace("\'", "\""))
                    if not isinstance(arr, list):
                        print('bad record:', rest_id, opening_hours)
                    else:
                        for s in arr:
                            times = s.split("/")
                            param4 = [self.city, self.date, self.rest_area, rest_id, *times]
                            self.insert(self.batch4, param4)

                    # TODO the average money-off discount is still questionable
                    discount_rate_list = []
                    for active in item.get('activities'):
                        attribute = getMapValue(active, "attribute")
                        active_type = getMapValue(active, "icon_name")
                        param2 = [
                            self.city, self.date, self.rest_area, rest_id,
                            getMapValue(active, "description"), active_type
                        ]
                        self.insert(self.batch2, param2)
                        if active_type == '减' and attribute != '-999':
                            arr = json.loads(attribute)
                            for key, value in arr.items():
                                if isinstance(value, dict):
                                    sub_price = value.get("1")
                                    if sub_price == 0:
                                        sub_price = value.get("0")
                                else:
                                    sub_price = value
                                key = round(float(key), 2)
                                sub_price = round(float(sub_price), 2)
                                discount_rate = round(sub_price / key, 4)
                                discount_rate_list.append(discount_rate)
                                param5 = [
                                    self.city, self.date, self.rest_area, rest_id,
                                    key, sub_price, discount_rate
                                ]
                                self.insert(self.batch5, param5)

                    # TODO the average money-off discount is still questionable
                    if len(discount_rate_list):
                        avg_discount_rate = sum(discount_rate_list) / len(discount_rate_list)
                        avg_discount_rate = round(avg_discount_rate, 4)
                        param6 = [self.city, self.date, self.rest_area, rest_id, avg_discount_rate]
                        self.insert(self.batch6, param6)

                # de-duplicate (shop, category) pairs
                if category_level2_id not in self.rest_category[rest_id]:
                    self.rest_category[rest_id].add(category_level2_id)
                    category_level1_id = category_item.get('id1')
                    category_level1_name = category_item.get('name')
                    category_level2_name = category_item.get('name2')
                    param3 = [
                        self.city, self.date, self.rest_area,
                        category_level1_id, category_level1_name,
                        category_level2_id, category_level2_name,
                        rest_id, getMapValue(item, "name")
                    ]
                    self.insert(self.batch3, param3)
        except Exception:
            # print("failed to parse data", traceback.format_exc())
            print('failed to parse record\nbad record: {}'.format(data))
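# Illustration only: a worked example of the money-off parsing above, using a made-up
# `attribute` JSON. Each key is the spend threshold and the value is the amount taken
# off (sometimes nested in a dict under "1"/"0"), so discount_rate = sub_price / key.
import json

attribute = '{"30": 5, "60": {"1": 12, "0": 10}}'   # hypothetical attribute payload
for key, value in json.loads(attribute).items():
    if isinstance(value, dict):
        sub_price = value.get("1")
        if sub_price == 0:
            sub_price = value.get("0")
    else:
        sub_price = value
    key = round(float(key), 2)
    sub_price = round(float(sub_price), 2)
    print(key, sub_price, round(sub_price / key, 4))
# 30.0 5.0 0.1667
# 60.0 12.0 0.2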
def __init__(self, resource_path, city, date, rest_area, db, is_test=False):
    ParseDataObject.__init__(self, resource_path, is_test)
    self.err_rest_ids = set()
    self.rest_category = defaultdict(set)
    self.city = city
    self.date = date
    self.rest_area = rest_area
    self.db = db
    sql1 = "insert into t_e_rest_list_import values"
    sql2 = "insert into t_e_rest_active_import values"
    sql3 = "insert into t_e_rest_category_import values"
    sql4 = "insert into t_e_rest_open_time_import values"
    sql5 = "insert into t_e_rest_money_off_import values"
    sql6 = "insert into t_e_rest_money_off_avg_import values"
    self.batch1 = BatchSql(sql1)
    self.batch2 = BatchSql(sql2)
    self.batch3 = BatchSql(sql3)
    self.batch4 = BatchSql(sql4)
    self.batch5 = BatchSql(sql5)
    self.batch6 = BatchSql(sql6)
class ParseCategory(object):
    def __init__(self, resource_path, city, city_cn, date, date_short, db):
        '''
        :param resource_path: path of the city data files
        :param city: city name in pinyin
        :param city_cn: city name in Chinese
        :param date: save date
        :param date_short: short form of the save date
        :param db: database configuration
        :return:
        '''
        self.resource_path = resource_path
        self.city = city
        self.city_cn = city_cn
        self.date = date
        self.date_short = date_short
        self.db = db
        self.sql1 = "insert into t_e_rest_list_city_pre_%s values" % date_short
        self.sql2 = "insert into t_e_rest_active_city_%s values" % date_short
        self.sql3 = "insert into t_e_rest_category_city_%s values" % date_short
        self.sql4 = "insert into t_e_rest_open_time_city_%s values" % date_short
        self.sql5 = "insert into t_e_rest_money_off_city_%s values" % date_short
        self.sql6 = "insert into t_e_rest_money_off_avg_city_%s values" % date_short
        self.batch1 = BatchSql(self.sql1)
        self.batch2 = BatchSql(self.sql2)
        self.batch3 = BatchSql(self.sql3)
        self.batch4 = BatchSql(self.sql4)
        self.batch5 = BatchSql(self.sql5)
        self.batch6 = BatchSql(self.sql6)
        self.err_rest_ids = set()
        self.rest_category = defaultdict(set)
        self.category_level1_list = []
        self.category_level2_list = []

    def get_data_by_level2_id(self, category_level2_id):
        category_level1_id, category_level1_name, category_level2_name = '-999', '-999', '-999'
        for item in self.category_level2_list:
            if category_level2_id == str(item.get('id2')):
                category_level2_name = str(item.get('name2'))
                category_level1_id = str(item.get('id1'))
                category_level1_name = str(item.get('name'))
                break
        # for item in category_level1_list:
        #     if category_level1_id == str(item[0]):
        #         category_level1_name = str(item[1])
        #         break
        return category_level1_id, category_level1_name, category_level2_id, category_level2_name

    def parse_data(self, data, m):
        '''
        :param data: one record of city data
        :param m: city map helper used to validate coordinates
        :return:
        '''
        try:
            category_item = data['param'][2]
            category_level2_id = str(category_item.get("id2"))
            item = data["data"]
            rest_id = getMapValue(item, "id")
            lat = float(getMapValue(item, "latitude"))
            lng = float(getMapValue(item, "longitude"))
            if len(rest_id) > 11:
                print('bad shop id\nrest_id: {}'.format(rest_id))
            elif not m.is_in_city(lng, lat):
                self.err_rest_ids.add(rest_id)
                print('bad address\nrest_id: {}, lat: {}, lng: {}, name: {}\n'
                      'address: {}'.format(rest_id, lat, lng,
                                           getMapValue(item, "name"),
                                           getMapValue(item, "address")))
            else:
                # de-duplicate shops: only the first record of a shop feeds the list tables
                if not self.rest_category[rest_id]:
                    self.batch1.addBatch([
                        self.city, self.date, rest_id,
                        getMapValue(item, "name"),
                        getMapValue(item, "phone", '/'),
                        getMapValue(item, "address"),
                        getMapValue(item, "average_cost"),
                        [1, 0][getMapValue(item, "delivery_mode") == '-999'],
                        getMapValue(item, "float_delivery_fee"),
                        getMapValue(item, "float_minimum_order_amount"),
                        [0, 1][getMapValue(item, "is_new") == 'True'],
                        [0, 1][getMapValue(item, "is_premium") == 'True'],
                        lat, lng,
                        getMapValue(item, "recent_order_num"), '-999'
                    ])
                    if self.batch1.getSize() > 10000:
                        self.db.update(self.batch1)
                        self.batch1.cleanBatch()

                    opening_hours = getMapValue(item, "opening_hours")
                    arr = json.loads(opening_hours.replace("\'", "\""))
                    if not isinstance(arr, list):
                        print('bad record:', rest_id, opening_hours)
                        print('bad record\nrest_id: {}, opening_hours: {}'.format(rest_id, opening_hours))
                    else:
                        for s in arr:
                            times = s.split("/")
                            self.batch4.addBatch([self.city, self.date, rest_id, *times])
                            if self.batch4.getSize() > 100000:
                                self.db.update(self.batch4)
                                self.batch4.cleanBatch()

                    # TODO the average money-off discount is still questionable
                    discount_rate_list = []
                    for active in item.get('activities'):
                        attribute = getMapValue(active, "attribute")
                        active_type = getMapValue(active, "icon_name")
                        self.batch2.addBatch([
                            self.city, self.date, rest_id,
                            getMapValue(active, "description"), active_type
                        ])
                        if self.batch2.getSize() > 100000:
                            self.db.update(self.batch2)
                            self.batch2.cleanBatch()
                        if active_type == '减' and attribute != '-999':
                            arr = json.loads(attribute)
                            for key, value in arr.items():
                                if isinstance(value, dict):
                                    sub_price = value.get("1")
                                    if sub_price == 0:
                                        sub_price = value.get("0")
                                else:
                                    sub_price = value
                                key = round(float(key), 2)
                                sub_price = round(float(sub_price), 2)
                                discount_rate = round(sub_price / key, 4)
                                discount_rate_list.append(discount_rate)
                                self.batch5.addBatch([
                                    self.city, self.date, rest_id,
                                    key, sub_price, discount_rate
                                ])
                                if self.batch5.getSize() > 100000:
                                    self.db.update(self.batch5)
                                    self.batch5.cleanBatch()

                    # TODO the average money-off discount is still questionable
                    if len(discount_rate_list):
                        avg_discount_rate = sum(discount_rate_list) / len(discount_rate_list)
                        avg_discount_rate = round(avg_discount_rate, 4)
                        self.batch6.addBatch([self.city, self.date, rest_id, avg_discount_rate])
                        if self.batch6.getSize() > 100000:
                            self.db.update(self.batch6)
                            self.batch6.cleanBatch()

                # de-duplicate (shop, category) pairs
                if category_level2_id not in self.rest_category[rest_id]:
                    self.rest_category[rest_id].add(category_level2_id)
                    # TODO the separate category file was dropped; category info now arrives with the
                    # shop record, so it can be read directly from category_item
                    # category_level1_id, category_level1_name, category_level2_id, category_level2_name = get_data_by_level2_id(category_level2_id)
                    category_level1_id = category_item.get('id1')
                    category_level1_name = category_item.get('name')
                    category_level2_name = category_item.get('name2')
                    self.batch3.addBatch([
                        self.city, self.date,
                        category_level1_id, category_level1_name,
                        category_level2_id, category_level2_name,
                        rest_id, getMapValue(item, "name")
                    ])
                    if self.batch3.getSize() > 100000:
                        self.db.update(self.batch3)
                        self.batch3.cleanBatch()
        except Exception as e:
            print("failed to parse data", traceback.format_exc())
            print('failed to parse record\nbad record: {}'.format(data))

    def run(self):
        from ShopMonitor.elm.ElmWebParse.ElmMapUtil import ElmMapUtil
        m = ElmMapUtil(self.city_cn)
        f_name = os.path.join(self.resource_path, 'category.pickle')
        print("parsing rest file:", f_name)
        with open(f_name, "rb+") as f:
            while 1:
                try:
                    data = pickle.load(f)
                    self.parse_data(data, m)
                except EOFError as e:
                    print("finished reading rest file", e)
                    break
                except Exception as e2:
                    print("error", e2)
                    break
        # flush whatever is left in every batch
        for batch in (self.batch1, self.batch2, self.batch3, self.batch4, self.batch5, self.batch6):
            if batch.getSize() > 0:
                self.db.update(batch)
                batch.cleanBatch()
        print('rest_list count:', len(self.rest_category.keys()))
        print('rest_category count:', sum([len(x) for x in self.rest_category.values()]))
        print(self.city_cn, "bad shop count:", len(self.err_rest_ids))
        # with open("err_rest_ids_" + city_cn + ".txt", "w") as f:
        #     for item in err_rest_ids:
        #         f.write(str(item) + '\n')
        return self.err_rest_ids
class ParseScore(object):
    def __init__(self, resource_path, city, date, save_date_short, err_rest_ids, db):
        '''
        :param resource_path: path of the city data files
        :param city: city name in pinyin
        :param date: save date
        :param save_date_short: short form of the save date
        :param err_rest_ids: ids of invalid shops
        :param db: database configuration
        '''
        self.resource_path = resource_path
        self.city = city
        self.date = date
        self.save_date_short = save_date_short
        self.err_rest_ids = err_rest_ids
        self.db = db
        self.sql = "insert into t_e_rest_score_city_%s values" % save_date_short
        self.batch = BatchSql(self.sql)

    def parse_data(self, data):
        try:
            rest_id = data['param']
            if rest_id in self.err_rest_ids:
                return
            if len(rest_id) <= 11:
                self.batch.addBatch([
                    self.city, self.date, rest_id,
                    getMapValue(data['data'], "compare_rating"),
                    getMapValue(data['data'], "deliver_time"),
                    getMapValue(data['data'], "food_score"),
                    getMapValue(data['data'], "overall_score"),
                    getMapValue(data['data'], "service_score")
                ])
                if self.batch.getSize() > 100000:
                    self.db.update(self.batch)
                    self.batch.cleanBatch()
            else:
                print("bad record", data)
                print('bad record\n{}'.format(data))
        except Exception as e:
            print("failed to parse record", e)
            print('failed to parse record\nbad record: {}'.format(data))

    def run(self):
        f_name = os.path.join(self.resource_path, 'score.pickle')
        print("parsing file:", f_name)
        with open(f_name, "rb+") as f:
            while True:
                try:
                    data = pickle.load(f)
                    self.parse_data(data)
                except EOFError as e:
                    print("finished reading file", e)
                    break
                except Exception as e2:
                    print("error", e2)
                    break
        if self.batch.getSize() > 0:
            self.db.update(self.batch)
            self.batch.cleanBatch()
def deal_base_data(base_data):
    # insert the basic-information row into the database
    batch = BatchSql(sql)
    batch.addBatch(base_data)
    db.update(batch)
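# Illustration only: BatchSql comes from util.DB.DAO and its real implementation is
# not shown in this file. From the way it is used (addBatch / getSize / cleanBatch,
# with db.update(batch) issuing a multi-row INSERT), a minimal stand-in could look
# roughly like the sketch below; the names and behaviour here are assumptions, not
# the project's actual class.
class BatchSqlSketch(object):
    """Accumulates rows for a multi-row 'insert into ... values' statement."""

    def __init__(self, insert_prefix):
        self.insert_prefix = insert_prefix   # e.g. "insert into quan_shop values"
        self.rows = []

    def addBatch(self, row):
        self.rows.append(list(row))

    def getSize(self):
        return len(self.rows)

    def cleanBatch(self):
        self.rows = []

    def toSql(self):
        # One placeholder group per buffered row; a DB helper would pass the
        # flattened values alongside this statement as query parameters.
        groups = ", ".join("(" + ", ".join(["%s"] * len(r)) + ")" for r in self.rows)
        return self.insert_prefix + " " + groups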
class ParseMenu(object):
    def __init__(self, resource_path, city, date, city_short, save_date_short, err_rest_ids, db):
        '''
        :param resource_path: path of the data files
        :param city: city name in pinyin
        :param date: save date
        :param city_short: short form of the city name
        :param save_date_short: short form of the save date
        :param err_rest_ids: ids of invalid shops
        :param db: database configuration
        '''
        self.resource_path = resource_path
        self.city = city
        self.date = date
        self.city_short = city_short
        self.save_date_short = save_date_short
        self.err_rest_ids = err_rest_ids
        self.db = db
        self.sql1 = "insert into t_e_rest_menu_level1_city_%s values" % save_date_short
        self.sql2 = "insert into t_e_rest_menu_level2_city_%s_%s values" % (city_short, save_date_short)
        self.sql3 = "insert into t_e_rest_menu_level2_unique_city_%s_%s values" % (city_short, save_date_short)
        self.sql4 = "insert into t_e_rest_menu_discount_city_%s values" % save_date_short
        self.batch1 = BatchSql(self.sql1)
        self.batch2 = BatchSql(self.sql2)
        self.batch3 = BatchSql(self.sql3)
        self.batch4 = BatchSql(self.sql4)
        self.rest_food = defaultdict(set)

    def parse_data(self, data):
        try:
            rest_id = getMapValue(data, 'param')
            if rest_id in self.err_rest_ids:
                return
            for item in data["data"]:
                # skip malformed records that carry no food list
                try:
                    foods = item['foods'][::-1]
                except KeyError:
                    print('bad record\n{}'.format(item))
                    continue
                menu_name = getMapValue(item, 'name')
                self.batch1.addBatch([self.city, self.date, rest_id, menu_name])
                if self.batch1.getSize() > 100000:
                    self.db.update(self.batch1)
                    self.batch1.cleanBatch()
                for item2 in foods:
                    food_id = getMapValue(item2, 'virtual_food_id')
                    food_price_current = getMapValue(item2["specfoods"][0], 'price')
                    food_price_primary = getMapValue(item2["specfoods"][0], 'original_price')
                    has_activity = '0'
                    if food_price_primary not in ['-999', '0']:
                        has_activity = '1'
                    self.batch2.addBatch([
                        self.city, self.date, rest_id, food_id,
                        getMapValue(item2, 'name'),
                        getMapValue(item2, 'month_sales'),
                        getMapValue(item2, 'rating'),
                        getMapValue(item2, 'satisfy_count'),
                        food_price_current, has_activity, menu_name
                    ])
                    if self.batch2.getSize() > 100000:
                        self.db.update(self.batch2)
                        self.batch2.cleanBatch()
                    if food_id not in self.rest_food[rest_id]:
                        self.rest_food[rest_id].add(food_id)
                        self.batch3.addBatch([
                            self.city, self.date, rest_id, food_id,
                            getMapValue(item2, 'name'),
                            getMapValue(item2, 'month_sales'),
                            getMapValue(item2, 'rating'),
                            getMapValue(item2, 'satisfy_count'),
                            food_price_current, has_activity, menu_name
                        ])
                        if self.batch3.getSize() > 100000:
                            self.db.update(self.batch3)
                            self.batch3.cleanBatch()
                    if food_price_primary not in ['-999', '0']:
                        discount = '%.6f' % (float(food_price_current) / float(food_price_primary))
                        self.batch4.addBatch([
                            self.city, self.date, rest_id, food_id,
                            food_price_primary, food_price_current, discount
                        ])
                        if self.batch4.getSize() > 100000:
                            self.db.update(self.batch4)
                            self.batch4.cleanBatch()
        except Exception as e:
            print("failed to parse record", e)
            print('failed to parse record\nbad record: {}'.format(data))

    def run(self):
        f_name = os.path.join(self.resource_path, 'menu.pickle')
        print("parsing file:", f_name)
        with open(f_name, "rb+") as f:
            while 1:
                try:
                    data = pickle.load(f)
                    self.parse_data(data)
                except EOFError as e:
                    print("finished reading file", e)
                    break
                except Exception as e2:
                    print("error", e2)
                    break
        # flush whatever is left in every batch
        for batch in (self.batch1, self.batch2, self.batch3, self.batch4):
            if batch.getSize() > 0:
                self.db.update(batch)
                batch.cleanBatch()
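# Illustration only: a dish that carries an original price triggers the discount record
# above. With a hypothetical current price of 15.0 and original price of 20.0:
food_price_current, food_price_primary = '15.0', '20.0'   # made-up values
discount = '%.6f' % (float(food_price_current) / float(food_price_primary))
print(discount)   # '0.750000' -> the dish currently sells at 75% of its original price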
class ParseMenuObject(ParseDataObject):
    """ Parses menu.pickle data. """

    def __init__(self, resource_path, city, date, rest_area, err_rest_ids, db, is_test=False):
        ParseDataObject.__init__(self, resource_path, is_test)
        self.err_rest_ids = err_rest_ids
        self.rest_food = defaultdict(set)
        self.city = city
        self.date = date
        self.rest_area = rest_area
        self.db = db
        sql1 = "insert into t_e_rest_menu_level1_import values"
        sql2 = "insert into t_e_rest_menu_level2_import values"
        sql3 = "insert into t_e_rest_menu_level2_unique_import values"
        sql4 = "insert into t_e_rest_menu_discount_import values"
        self.batch1 = BatchSql(sql1)
        self.batch2 = BatchSql(sql2)
        self.batch3 = BatchSql(sql3)
        self.batch4 = BatchSql(sql4)

    def parse(self, generator=None):
        generator = generator if generator else self.GeneratorPickleData()
        try:
            for data in generator:
                self.parse_data(data)
            self.insertAll()
        except:
            print("failed to parse data:", traceback.print_exc())

    def insert(self, batch, param):
        batch.addBatch(param)
        if batch.getSize() > 100000:
            self.db.update(batch)
            batch.cleanBatch()

    def insertAll(self):
        # flush whatever is left in every batch
        for batch in (self.batch1, self.batch2, self.batch3, self.batch4):
            if batch.getSize() > 0:
                self.db.update(batch)
                batch.cleanBatch()

    def parse_data(self, data):
        try:
            rest_id = getMapValue(data, 'param')
            # TODO err_rest_ids has to be passed in
            if rest_id in self.err_rest_ids:
                return
            for item in data["data"]:
                # skip malformed records that carry no food list
                try:
                    foods = item['foods'][::-1]
                except KeyError:
                    print('bad record\n{}'.format(item))
                    continue
                menu_name = getMapValue(item, 'name')
                param1 = [self.city, self.date, self.rest_area, rest_id, menu_name]
                self.insert(self.batch1, param1)
                for item2 in foods:
                    food_id = getMapValue(item2, 'virtual_food_id')
                    food_price_current = getMapValue(item2["specfoods"][0], 'price')
                    food_price_primary = getMapValue(item2["specfoods"][0], 'original_price')
                    has_activity = '0'
                    if food_price_primary not in ['-999', '0']:
                        has_activity = '1'
                    param2 = [
                        self.city, self.date, self.rest_area, rest_id, food_id,
                        getMapValue(item2, 'name'),
                        getMapValue(item2, 'month_sales'),
                        getMapValue(item2, 'rating'),
                        getMapValue(item2, 'satisfy_count'),
                        food_price_current, has_activity, menu_name
                    ]
                    self.insert(self.batch2, param2)
                    if food_id not in self.rest_food[rest_id]:
                        self.rest_food[rest_id].add(food_id)
                        param3 = [
                            self.city, self.date, self.rest_area, rest_id, food_id,
                            getMapValue(item2, 'name'),
                            getMapValue(item2, 'month_sales'),
                            getMapValue(item2, 'rating'),
                            getMapValue(item2, 'satisfy_count'),
                            food_price_current, has_activity, menu_name
                        ]
                        self.insert(self.batch3, param3)
                    if food_price_primary not in ['-999', '0']:
                        discount = '%.6f' % (float(food_price_current) / float(food_price_primary))
                        param4 = [
                            self.city, self.date, self.rest_area, rest_id, food_id,
                            food_price_primary, food_price_current, discount
                        ]
                        self.insert(self.batch4, param4)
        except Exception as e:
            print("failed to parse record", traceback.print_exc())
            print('failed to parse record\nbad record: {}'.format(data))
class ParseRatingObject(ParseDataObject):
    """ Parses shop rating data. """

    def __init__(self, resource_path, city, date, rest_area, db, is_test=False):
        ParseDataObject.__init__(self, resource_path, is_test)
        self.err_rest_ids = set()
        self.city = city
        self.date = date
        self.rest_area = rest_area
        self.db = db
        sql1 = "insert into t_e_rest_rating_import values"
        sql2 = "insert into t_e_rest_food_rating_import values"
        self.batch1 = BatchSql(sql1)
        self.batch2 = BatchSql(sql2)

    def parse(self, generator=None):
        generator = generator if generator else self.GeneratorPickleData()
        try:
            for data in generator:
                self.parse_data(data)
            self.insertAll()
        except:
            print("failed to parse data:", traceback.print_exc())

    def insert(self, batch, param):
        batch.addBatch(param)
        if batch.getSize() > 100000:
            self.db.update(batch)
            batch.cleanBatch()

    def insertAll(self):
        if self.batch1.getSize() > 0:
            self.db.update(self.batch1)
            self.batch1.cleanBatch()
        if self.batch2.getSize() > 0:
            self.db.update(self.batch2)
            self.batch2.cleanBatch()

    def parse_data(self, data):
        print("data", data)
        try:
            rest_id = str(data["param"])
            if len(rest_id) <= 11:
                item1 = data["data"]
                print('item1', item1)
                rating_id = str(uuid.uuid1())   # synthetic id linking the rating to its per-food rows
                print("rating_id", rating_id)
                rate_time = getMapValue(item1, "rated_at")
                rating_text = getMapValue(item1, "rating_text")
                reply_text = getMapValue(item1, "reply_text")
                rating_star = getMapValue(item1, "rating_star")
                username = getMapValue(item1, "username")
                param1 = [self.city, self.date, self.rest_area, rest_id, rating_id,
                          rate_time, rating_text, reply_text, rating_star, username]
                self.insert(self.batch1, param1)
                for item2 in item1["item_ratings"]:
                    food_id = getMapValue(item2, "food_id")
                    food_name = getMapValue(item2, "food_name")
                    image_hash = getMapValue(item2, "image_hash")
                    param2 = [self.city, self.date, self.rest_area, rest_id, rating_id, rate_time,
                              food_id, image_hash, food_name, rating_star, rating_text, reply_text]
                    self.insert(self.batch2, param2)
            else:
                print("bad record", data)
        except Exception as e:
            print(traceback.print_exc())
            print('failed to parse record\nbad record: {}'.format(data))
def __init__(self, resource_path, city, city_cn, date, date_short, db):
    '''
    :param resource_path: path of the city data files
    :param city: city name in pinyin
    :param city_cn: city name in Chinese
    :param date: save date
    :param date_short: short form of the save date
    :param db: database configuration
    :return:
    '''
    self.resource_path = resource_path
    self.city = city
    self.city_cn = city_cn
    self.date = date
    self.date_short = date_short
    self.db = db
    self.sql1 = "insert into t_e_rest_list_city_pre_%s values" % date_short
    self.sql2 = "insert into t_e_rest_active_city_%s values" % date_short
    self.sql3 = "insert into t_e_rest_category_city_%s values" % date_short
    self.sql4 = "insert into t_e_rest_open_time_city_%s values" % date_short
    self.sql5 = "insert into t_e_rest_money_off_city_%s values" % date_short
    self.sql6 = "insert into t_e_rest_money_off_avg_city_%s values" % date_short
    self.batch1 = BatchSql(self.sql1)
    self.batch2 = BatchSql(self.sql2)
    self.batch3 = BatchSql(self.sql3)
    self.batch4 = BatchSql(self.sql4)
    self.batch5 = BatchSql(self.sql5)
    self.batch6 = BatchSql(self.sql6)
    self.err_rest_ids = set()
    self.rest_category = defaultdict(set)
    self.category_level1_list = []
    self.category_level2_list = []
import os
import pickle

from util.DB.DAO import BatchSql
from util.DB.DAO import DBUtils
from ShopMonitor.elm.ElmWebParse.ParseTools import getMapValue

sql1 = "insert into tmp_e_rest_rating_import(city,date,rest_id,rate_time,rating_txt,reply_time,reply_txt," \
       "rating_star,user_name)values"
sql2 = "insert into tmp_e_rest_food_rating_import(city,date,rest_id,rating_id,rated_at,food_id,image_hash,rate_name," \
       "rating_star,rating_text,reply_at,replay_text)values"
db = DBUtils(('192.168.1.200', 3306, 'njjs', 'njjs1234', 'exdata', 'utf8mb4'))
batch1 = BatchSql(sql1)
batch2 = BatchSql(sql2)


def parse_data(data, city, date):
    try:
        rest_id = data["param"][0]
        if len(rest_id) <= 11:
            for item1 in data["data"]:
                rate_time = getMapValue(item1, "rated_at")
                batch1.addBatch([city, date, rest_id, rate_time,
                                 getMapValue(item1, "rating_text"),
                                 getMapValue(item1, "reply_at"),
                                 getMapValue(item1, "reply_text"),
                                 getMapValue(item1, "rating_star"),
                                 getMapValue(item1, "username")])
                if batch1.getSize() > 100000:
                    db.update(batch1)
                    batch1.cleanBatch()
class ParseScoreObject(ParseDataObject):
    """ Parses shop score data. """

    def __init__(self, resource_path, city, date, rest_area, err_rest_ids, db, is_test=False):
        ParseDataObject.__init__(self, resource_path, is_test)
        self.err_rest_ids = err_rest_ids
        self.city = city
        self.date = date
        self.rest_area = rest_area
        self.db = db
        sql = "insert into t_e_rest_score_import values"
        self.batch1 = BatchSql(sql)

    def parse(self, generator=None):
        generator = generator if generator else self.GeneratorPickleData()
        try:
            for data in generator:
                self.parse_data(data)
            self.insertAll()
        except:
            print("failed to parse data:", traceback.print_exc())

    def insert(self, batch, param):
        batch.addBatch(param)
        if batch.getSize() > 100000:
            self.db.update(batch)
            batch.cleanBatch()

    def insertAll(self):
        if self.batch1.getSize() > 0:
            self.db.update(self.batch1)
            self.batch1.cleanBatch()

    def parse_data(self, data):
        try:
            rest_id = data['param']
            if rest_id in self.err_rest_ids:
                return
            if len(rest_id) <= 11:
                param1 = [
                    self.city, self.date, self.rest_area, rest_id,
                    getMapValue(data['data'], "compare_rating"),
                    getMapValue(data['data'], "deliver_time"),
                    getMapValue(data['data'], "food_score"),
                    getMapValue(data['data'], "overall_score"),
                    getMapValue(data['data'], "service_score")
                ]
                self.insert(self.batch1, param1)
            else:
                print("bad record", data)
        except Exception as e:
            print('failed to parse record\nbad record: {}'.format(data))
def deal_quan_position():
    '''
    Build the shop-circle lists: for every monitored shop, find the shop ids within a
    1.5 km, 3 km and 5 km radius and write them into the quan_position table.
    :return:
    '''
    for i in range(len(area_shop_id)):          # iterate over every monitored shop
        for j in range(0, 3):                   # the three circle-radius types
            sql = """
                SELECT t1.rest_id
                FROM t_e_rest_list_city_1712 t1
                WHERE t1.longitude IN (
                    SELECT t1.longitude FROM t_e_rest_list_city_1712 t1
                    WHERE t1.city = 'nanjing'
                      AND round(
                            6378.138 * 2 * asin(
                                sqrt(
                                    pow(sin((t1.latitude * pi() / 180 - %s * pi() / 180) / 2), 2)
                                    + cos(t1.latitude * pi() / 180) * cos(%s * pi() / 180)
                                    * pow(sin((t1.longitude * pi() / 180 - %s * pi() / 180) / 2), 2)
                                )
                            ) * 1000
                          ) <= %s)
                  AND t1.latitude IN (
                    SELECT t1.latitude FROM t_e_rest_list_city_1712 t1
                    WHERE t1.city = 'nanjing'
                      AND round(
                            6378.138 * 2 * asin(
                                sqrt(
                                    pow(sin((t1.latitude * pi() / 180 - %s * pi() / 180) / 2), 2)
                                    + cos(t1.latitude * pi() / 180) * cos(%s * pi() / 180)
                                    * pow(sin((t1.longitude * pi() / 180 - %s * pi() / 180) / 2), 2)
                                )
                            ) * 1000
                          ) <= %s)
            """ % (shop_latitude[i], shop_latitude[i], shop_longitude[i], type[j],
                   shop_latitude[i], shop_latitude[i], shop_longitude[i], type[j])
            # print(sql)
            area_shop_list = db2.queryForList(sql, None)    # shop ids inside this circle
            all_area_shop_list = []
            for item in area_shop_list:
                all_area_shop_list.append(item[0])
            # light clean-up of the id list: strip the surrounding [ ] so it can be stored as text
            all_str_shop_ids = str(all_area_shop_list)
            all_str_shop_ids = all_str_shop_ids.split('[')[1]
            all_str_shop_ids = all_str_shop_ids.split(']')[0]
            sql2 = 'insert into quan_position values'
            quan_position = []
            quan_position.extend([0, '2017-12-31', area_shop_id[i], area_shop_name[i],
                                  j + 1, all_str_shop_ids, time, time])
            print("shop circle list:", quan_position)
            batch = BatchSql(sql2)
            batch.addBatch(quan_position)
            db1.update(batch)
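# Illustration only: the SQL above is the haversine great-circle distance, using
# 6378.138 km as the Earth radius and converting the result to metres before comparing
# it with the radius threshold (1500 / 3000 / 5000). The same formula in Python, for
# checking a single pair of points (the coordinates below are made up):
import math

def haversine_m(lat1, lon1, lat2, lon2):
    """Great-circle distance in metres, mirroring the SQL expression."""
    lat1, lon1, lat2, lon2 = map(math.radians, (lat1, lon1, lat2, lon2))
    a = (math.sin((lat2 - lat1) / 2) ** 2
         + math.cos(lat1) * math.cos(lat2) * math.sin((lon2 - lon1) / 2) ** 2)
    return 6378.138 * 2 * math.asin(math.sqrt(a)) * 1000

print(round(haversine_m(32.0603, 118.7969, 32.0700, 118.8100)))   # roughly 1.6 km apart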