Пример #1
0
class ParseDeliveymode(object):
    def __init__(self,resource_path, city, date,save_date_short,db):
        '''
        :param resource_path: 城市数据文件路径
        :param city: 城市拼音
        :param date: 保存日期
        :param save_date_short:保存日期简写
        :param db: 数据库配置信息
        '''
        self.resource_path = resource_path
        self.city = city
        self.date = date
        self.save_date_short = save_date_short
        self.db = db

        self.sql = "insert into t_e_delivery_mode_city_pre_%s values" % save_date_short
        self.batch = BatchSql(self.sql)

    def parse_data(self, data):
        try:
            rest_id = str(data['param'])
            if len(rest_id) <= 11:
                data = data['data'].get('delivery_mode')
                delever_id = getMapValue(data,"id")
                delever_text = getMapValue(data, "text")
                param = [self.city, self.date,rest_id,delever_text,delever_id]
                self.batch.addBatch(param)
                if self.batch.getSize() > 100000:
                    self.db.update(self.batch)
                    self.batch.cleanBatch()
            else:
                print("错误数据", data)
        except Exception as e:
            print(traceback.print_exc())
            print('解析数据报错\n错误数据:{}'.format(data))

    def run(self):
        f_name = os.path.join(self.resource_path, 'rest_info.pickle')
        print("解析文件:", f_name)
        with open(f_name, "rb+") as f:
            while True:
                try:
                    data = pickle.load(f)
                    self.parse_data(data)
                except EOFError as e:
                    print("文件读取完成", e)
                    break
                except Exception as e2:
                    print("报错", e2)
                    break

        if self.batch.getSize() > 0:
            self.db.update(self.batch)
            self.batch.cleanBatch()
class ParseRatingTagObject(ParseDataObject):
    """
        category.pickle数据解析
    """
    def __init__(self,
                 resource_path,
                 city,
                 date,
                 rest_area,
                 err_rest_ids,
                 db,
                 is_test=False):
        ParseDataObject.__init__(self, resource_path, is_test)
        self.err_rest_ids = err_rest_ids
        self.city = city
        self.date = date
        self.rest_area = rest_area

        self.db = db
        sql = "insert into t_e_rest_rating_tag_import values"
        self.batch1 = BatchSql(sql)

    def parse(self, generator=None):
        generator = generator if generator else self.GeneratorPickleData()
        try:
            for data in generator:
                self.parse_data(data)
            self.insertAll()
        except:
            print("解析数据报错:", traceback.print_exc())

    def insert(self, batch, param):
        batch.addBatch(param)
        if batch.getSize() > 100000:
            self.db.update(batch)
            batch.cleanBatch()

    def insertAll(self):
        if self.batch1.getSize() > 0:
            self.db.update(self.batch1)
            self.batch1.cleanBatch()

    def parse_data(self, data):
        try:
            rest_id = data['param']
            if rest_id in self.err_rest_ids:
                return
            if len(rest_id) <= 11:
                for item in data["data"]:
                    param1 = [
                        self.city, self.date, self.rest_area, rest_id,
                        getMapValue(item, "name"),
                        getMapValue(item, "count")
                    ]
                    self.insert(self.batch1, param1)
            else:
                print("错误数据", data)

        except Exception as e:
            print("解析数据报错", e)
Пример #3
0
def deal_quan_category(city,date):
    '''
    处理店圈的菜品类信息,一级品类,二级品类信息
    :param city:
    :param date:
    :return:
    '''
    sql = """
        select DISTINCT(t1.category_id_level1),t1.category_name_level1,0 pid         
        from t_e_rest_category_city_1712 t1
        where t1.city = '%s'
        and t1.date = '%s'
        union 
        select DISTINCT(t1.category_id_level2),t1.category_name_level2,t1.category_id_level1 pid         
        from t_e_rest_category_city_1712 t1
        where t1.city = '%s'
        and t1.date = '%s'
        
          """%(city,date,city,date)
    result = db2.queryForList(sql, None)
    print(result)

    sql2 = "insert into quan_category values"
    batch = BatchSql(sql2)

    for item in result:
        quan_category = []
        quan_category.extend([0,item[1],date,item[0],item[2],time,time])
        print(quan_category)
        batch.addBatch(quan_category)
        if batch.getSize() > 10000:
            db1.update(batch)
            batch.cleanBatch()
    db1.update(batch)
Пример #4
0
def deal_quan_monitor_activity_data(city,date,all_shopid_list):
    '''
    处理店圈店铺的活动信息
    :param city: 所属城市
    :param date: 数据日期
    :param all_shopid_list: 店圈店铺的ID列表
    :return:
    '''
    sql="""
        select t1.rest_id,t1.description,t1.active_type
        from t_e_rest_active_city_1712 t1
        where t1.city  = '%s'
        and t1.date = '%s'
        and t1.rest_id IN %s
        """%(city,date,all_shopid_list)
    sql2 = "insert into quan_monitor_activity_data values"
    batch = BatchSql(sql2)
    result = db2.queryForList(sql, None)
    print(result)

    for shoplist in all_shopid_list:
        all_activity = []
        quan_monitor_activity_data = []
        for item in result:
            activity = dict(type=item[2],content=item[1])
            if item[0] == shoplist:
                all_activity.append(activity)
        all_activity = json.dumps(all_activity)
        quan_monitor_activity_data.extend([0,date,shoplist,str(all_activity),time,time])
        print(quan_monitor_activity_data)
        batch.addBatch(quan_monitor_activity_data)
        if batch.getSize() > 10000:
            db1.update(batch)
            batch.cleanBatch()
    db1.update(batch)
Пример #5
0
def deal_quan_shop(city,date,all_shopid_list,cate_list):
    '''
    处理店圈店铺的基本数据
    :param city:  店圈所属城市
    :param date:  数据分析日期
    :param all_shopid_list:  店圈数据的店铺列表的数据
    :param cate_list: 店圈数据的品类列表
    :return:
    '''
    sql = """
          select t1.rest_id,t1.rest_name,t1.delivery_id,t1.longitude,t1.latitude,t1.is_new
          from t_e_rest_list_city_1712 t1
          where t1.city = '%s'
          and t1.date = '%s'
          and t1.rest_id IN %s
          """%(city,date,all_shopid_list)

    sql2 = "insert into quan_shop values"
    batch = BatchSql(sql2)

    result = db2.queryForList(sql, None)
    print(result)
    for item in result:
        for cate in cate_list:
            if item[0] == cate[0]:
                quan_shop = []
                quan_shop.extend([0,item[0],1,date,item[1],item[2],item[3],item[4],item[5],cate[1],time,time])
                print(quan_shop)
                batch.addBatch(quan_shop)

        if batch.getSize() > 1000:
            db1.update(batch)
            batch.cleanBatch()
    db1.update(batch)
Пример #6
0
class ParseDeliveryMode(ParseDataObject):
    """
        category.pickle数据解析
    """
    def __init__(self,
                 resource_path,
                 city,
                 date,
                 rest_area,
                 db,
                 is_test=False):
        ParseDataObject.__init__(self, resource_path, is_test)
        self.city = city
        self.date = date
        self.rest_area = rest_area

        self.db = db

        sql = "insert into t_e_delivery_mode_import values"
        self.batch1 = BatchSql(sql)

    def parse(self, generator=None):
        generator = generator if generator else self.GeneratorPickleData()
        try:
            for data in generator:
                self.parse_data(data)
            self.insertAll()
        except:
            print("解析数据报错:", traceback.print_exc())

    def insert(self, batch, param):
        batch.addBatch(param)
        if batch.getSize() > 100000:
            self.db.update(batch)
            batch.cleanBatch()

    def insertAll(self):
        if self.batch1.getSize() > 0:
            self.db.update(self.batch1)
            self.batch1.cleanBatch()

    def parse_data(self, data):
        # print(data)
        try:
            rest_id = str(data['param'])
            if len(rest_id) <= 11:
                data = data['data'].get('delivery_mode')
                delever_id = getMapValue(data, "id")
                delever_text = getMapValue(data, "text")
                param1 = [
                    self.city, self.date, self.rest_area, rest_id,
                    delever_text, delever_id
                ]
                self.insert(self.batch1, param1)
            else:
                print("错误数据", data)
        except Exception as e:
            print(traceback.print_exc())
            print('解析数据报错\n错误数据:{}'.format(data))
Пример #7
0
def deal_quan_monitor_hot_sku_data(city,date,all_shopid_list):
    '''
    处理店圈sku数据,9个店铺大约110万条数据
    :param city: 所属城市
    :param date: 数据日期
    :param all_shopid_list: 店圈店铺ID列表
    :return:
    '''
    sql="""
        select t1.rest_id,t1.food_name,t1.price,t1.food_id,t1.food_month_sales,t1.has_activity,t2.delivery_fee
        from t_e_rest_menu_level2_unique_city_nj_1712 t1,
             t_e_rest_list_city_1712 t2   
        where t1.city = '%s'
        and t1.date = '%s'
        and t1.rest_id IN %s
        and t1.food_month_sales != 0
        and t1.city = t2.city
        and t1.date = t2.date
        and t1.rest_id = t2.rest_id
        """%(city,date,all_shopid_list)
    sql2 = 'insert into quan_monitor_hot_sku_data values'
    batch = BatchSql(sql2)
    result = db2.queryForList(sql, None) 
    print(result)

    for shop_list in all_shopid_list:
        for item in result:
            quan_monitor_hot_sku_data = []
            if shop_list == item[0]:
                if item[5] == 1:   #特价菜
                    quan_monitor_hot_sku_data.extend([0,date,shop_list,item[1],item[2],item[2],item[4],time,time])
                    print(quan_monitor_hot_sku_data)
                    batch.addBatch(quan_monitor_hot_sku_data)
                elif item[5] == 0:  #满减菜品
                    sub_price = get_sub_price(city,date,shop_list,item[2])
                    if sub_price == None:
                        sub_price = 0
                    true_price = item[2]-sub_price
                    quan_monitor_hot_sku_data.extend([0,date,shop_list,item[1],item[2],true_price,item[4],time,time])
                    print(quan_monitor_hot_sku_data)
                    batch.addBatch(quan_monitor_hot_sku_data)
        if batch.getSize() > 10000:
            db1.update(batch)
            batch.cleanBatch()
    db1.update(batch)
Пример #8
0
def deal_quan_monitor_shop_data(city,date,all_shopid_list):
    '''
    处理店圈的配送费用和配送时间的信息
    :param city: 店圈所属城市
    :param date: 爬取数据的日期
    :param all_shopid_list: 所有店圈的店铺列表信息
    :return:
    '''
    sql = """
          select t1.rest_id,t1.order_month_sales,t2.deliver_time,t1.delivery_fee,t1.min_delivery_price,t2.overall_score
          from t_e_rest_list_city_1712 t1,
               t_e_rest_score_city_1712 t2
          where t1.city  = '%s'
          and t1.date = '%s'
          and t1.rest_id IN %s
          and t1.rest_id = t2.rest_id
          and t1.city = t2.city
          and t1.date = t2.date
          """%(city,date,all_shopid_list)

    sql2 = "insert into quan_monitor_shop_data values"
    batch = BatchSql(sql2)

    result = db2.queryForList(sql, None)

    for item in result:
        quan_monitor_shop_data = []
        score = float('%.1f'%float(item[5]))
        quan_monitor_shop_data.extend([0,date,item[0],item[1],item[2],item[3],item[4],score,time,time])
        print(quan_monitor_shop_data)
        batch.addBatch(quan_monitor_shop_data)

        if batch.getSize() > 10000:
            db1.update(batch)
            batch.cleanBatch()
    db1.update(batch)
Пример #9
0
class ParseHotword(object):
    def __init__(self, resource_path, city, date, save_date_short,
                 err_rest_ids, db):
        '''
        :param resource_path: 城市数据文件路径
        :param city: 城市拼音
        :param date: 保存日期
        :param save_date_short:保存日期简写
        :param err_rest_ids: 错误店铺ID
        :param db: 数据库配置信息
        '''
        self.resource_path = resource_path
        self.city = city
        self.date = date
        self.save_date_short = save_date_short
        self.err_rest_ids = err_rest_ids
        self.db = db

        self.sql = "insert into t_e_hot_search_word_city_%s values" % save_date_short
        self.batch = BatchSql(self.sql)

    def parse_data(self, data):
        try:
            rest_id = str(data["param"][0])
            if rest_id in self.err_rest_ids:
                return
            if len(rest_id) <= 11:
                la = data["param"][1]
                lo = data["param"][2]
                for item in data["data"]:
                    search_word = getMapValue(item, "search_word")
                    if search_word != '-999':
                        self.batch.addBatch([
                            self.city, self.date, rest_id, la, lo, search_word
                        ])
                    if self.batch.getSize() > 100000:
                        self.db.update(self.batch)
                        self.batch.cleanBatch()
            else:
                print("错误数据", data)
                print('错误数据\n{}'.format(data))
        except Exception as e:
            print("解析数据报错", e)
            print('解析数据报错\n错误数据:{}'.format(data))

    def run(self):
        f_name = os.path.join(self.resource_path, 'hot_word.pickle')
        print("解析文件:", f_name)
        with open(f_name, "rb+") as f:
            while True:
                try:
                    data = pickle.load(f)
                    self.parse_data(data)
                except EOFError as e:
                    print("文件读取完成", e)
                    break
                except Exception as e2:
                    print("报错", e2)
                    break

        if self.batch.getSize() > 0:
            self.db.update(self.batch)
            self.batch.cleanBatch()
class ParseCategoryObject(ParseDataObject):
    """
        category.pickle数据解析
    """
    def __init__(self,
                 resource_path,
                 city,
                 date,
                 rest_area,
                 db,
                 is_test=False):
        ParseDataObject.__init__(self, resource_path, is_test)
        self.err_rest_ids = set()
        self.rest_category = defaultdict(set)

        self.city = city
        self.date = date
        self.rest_area = rest_area

        self.db = db
        sql1 = "insert into t_e_rest_list_import values"
        sql2 = "insert into t_e_rest_active_import values"
        sql3 = "insert into t_e_rest_category_import values"
        sql4 = "insert into t_e_rest_open_time_import values"
        sql5 = "insert into t_e_rest_money_off_import values"
        sql6 = "insert into t_e_rest_money_off_avg_import values"

        self.batch1 = BatchSql(sql1)
        self.batch2 = BatchSql(sql2)
        self.batch3 = BatchSql(sql3)
        self.batch4 = BatchSql(sql4)
        self.batch5 = BatchSql(sql5)
        self.batch6 = BatchSql(sql6)

    def parse(self, generator=None):
        generator = generator if generator else self.GeneratorPickleData()
        try:
            for data in generator:
                self.parse_data(data)
            self.insertAll()
        except:
            print("解析数据报错:", traceback.print_exc())

    def insert(self, batch, param):
        batch.addBatch(param)
        if batch.getSize() > 100000:
            self.db.update(batch)
            batch.cleanBatch()

    def insertAll(self):
        if self.batch1.getSize() > 0:
            self.db.update(self.batch1)
            self.batch1.cleanBatch()
        if self.batch2.getSize() > 0:
            self.db.update(self.batch2)
            self.batch2.cleanBatch()
        if self.batch3.getSize() > 0:
            self.db.update(self.batch3)
            self.batch3.cleanBatch()
        if self.batch4.getSize() > 0:
            self.db.update(self.batch4)
            self.batch4.cleanBatch()
        if self.batch5.getSize() > 0:
            self.db.update(self.batch5)
            self.batch5.cleanBatch()
        if self.batch6.getSize() > 0:
            self.db.update(self.batch6)
            self.batch6.cleanBatch()

    def parse_data(self, data):
        try:
            category_item = data['param'][2]
            category_level2_id = str(category_item.get("id2"))
            item = data["data"]
            rest_id = getMapValue(item, "id")
            lat = float(getMapValue(item, "latitude"))
            lng = float(getMapValue(item, "longitude"))
            if len(rest_id) > 11:
                print('错误店铺id\nrest_id: {}'.format(rest_id))
            # elif not (self.polygonHellp.is_location_in_polygon(lng, lat)):
            #     self.err_rest_ids.add(rest_id)
            #     print('错误地址\nrest_id: {}, lat: {}, lng: {}, name: {}\n'
            #               'address: {}'.format(rest_id, lat, lng, getMapValue(item, "name"),
            #                                    getMapValue(item, "address")))

            else:
                # 店铺去重
                if not self.rest_category[rest_id]:
                    param1 = [
                        self.city, self.date, self.rest_area, rest_id,
                        getMapValue(item, "name"),
                        getMapValue(item, "phone", '/'),
                        getMapValue(item, "address"),
                        getMapValue(item, "average_cost"),
                        [1, 0][getMapValue(item, "delivery_mode") == '-999'],
                        getMapValue(item, "float_delivery_fee"),
                        getMapValue(item, "float_minimum_order_amount"),
                        [0, 1][getMapValue(item, "is_new") == 'True'],
                        [0, 1][getMapValue(item, "is_premium") == 'True'], lat,
                        lng,
                        getMapValue(item, "recent_order_num"), '-999'
                    ]
                    self.insert(self.batch1, param1)

                    opening_hours = getMapValue(item, "opening_hours")
                    arr = json.loads(opening_hours.replace("\'", "\""))
                    if not isinstance(arr, list):
                        print('错误数据:', rest_id, opening_hours)
                    else:
                        for s in arr:
                            times = s.split("/")
                            param4 = [
                                self.city, self.date, self.rest_area, rest_id,
                                *times
                            ]
                            self.insert(self.batch4, param4)

                    # TODO 平均满减折扣有问题
                    discount_rate_list = []
                    for active in item.get('activities'):
                        attribute = getMapValue(active, "attribute")
                        active_type = getMapValue(active, "icon_name")
                        param2 = [
                            self.city, self.date, self.rest_area, rest_id,
                            getMapValue(active, "description"), active_type
                        ]

                        self.insert(self.batch2, param2)

                        if active_type == '减' and attribute != '-999':
                            arr = json.loads(attribute)
                            for key, value in arr.items():
                                if isinstance(value, dict):
                                    sub_price = value.get("1")
                                    if sub_price == 0:
                                        sub_price = value.get("0")
                                else:
                                    sub_price = value
                                key = round(float(key), 2)
                                sub_price = round(float(sub_price), 2)
                                discount_rate = round(sub_price / key, 4)
                                discount_rate_list.append(discount_rate)
                                param5 = [
                                    self.city, self.date, self.rest_area,
                                    rest_id, key, sub_price, discount_rate
                                ]
                                self.insert(self.batch5, param5)

                    # TODO 平均满减折扣有问题
                    if len(discount_rate_list):
                        avg_discount_rate = sum(discount_rate_list) / len(
                            discount_rate_list)
                        avg_discount_rate = round(avg_discount_rate, 4)
                        param6 = [
                            self.city, self.date, self.rest_area, rest_id,
                            avg_discount_rate
                        ]
                        self.insert(self.batch6, param6)

                # 店铺对应品类去重
                if category_level2_id not in self.rest_category[rest_id]:
                    self.rest_category[rest_id].add(category_level2_id)

                    category_level1_id = category_item.get('id1')
                    category_level1_name = category_item.get('name')
                    category_level2_name = category_item.get('name2')
                    param3 = [
                        self.city, self.date, self.rest_area,
                        category_level1_id, category_level1_name,
                        category_level2_id, category_level2_name, rest_id,
                        getMapValue(item, "name")
                    ]
                    self.insert(self.batch3, param3)

        except Exception:
            # print("解析数据报错", traceback.format_exc())
            print('解析数据报错\n错误数据:{}'.format(data))
Пример #11
0
class ParseScore(object):
    def __init__(self,resource_path, city, date,save_date_short, err_rest_ids,db):
        '''
        :param resource_path: 城市数据文件路径
        :param city: 城市拼音
        :param date: 保存日期
        :param save_date_short:保存日期简写
        :param err_rest_ids: 错误店铺ID
        :param db: 数据库配置文件
        '''
        self.resource_path = resource_path
        self.city = city
        self.date = date
        self.save_date_short = save_date_short
        self.err_rest_ids = err_rest_ids
        self.db = db

        self.sql = "insert into t_e_rest_score_city_%s values" %save_date_short
        self.batch = BatchSql(self.sql)

    def parse_data(self,data):
        try:
            rest_id = data['param']
            if rest_id in self.err_rest_ids:
                return
            if len(rest_id) <= 11:
                self.batch.addBatch([self.city, self.date, rest_id,
                                getMapValue(data['data'], "compare_rating"),
                                getMapValue(data['data'], "deliver_time"),
                                getMapValue(data['data'], "food_score"),
                                getMapValue(data['data'], "overall_score"),
                                getMapValue(data['data'], "service_score")])
                if self.batch.getSize() > 100000:
                    self.db.update(self.batch)
                    self.batch.cleanBatch()
            else:
                print("错误数据", data)
                print('错误数据\n{}'.format(data))
        except Exception as e:
            print("解析数据报错", e)
            print('解析数据报错\n错误数据:{}'.format(data))


    def run(self):
        f_name = os.path.join(self.resource_path, 'score.pickle')
        print("解析文件:", f_name)
        with open(f_name, "rb+") as f:
            while True:
                try:
                    data = pickle.load(f)
                    self.parse_data(data)
                except EOFError as e:
                    print("文件读取完成", e)
                    break
                except Exception as e2:
                    print("报错", e2)
                    break

        if self.batch.getSize() > 0:
            self.db.update(self.batch)
            self.batch.cleanBatch()
Пример #12
0
class ParseMenu(object):
    def __init__(self, resource_path, city, date, city_short, save_date_short,
                 err_rest_ids, db):
        '''
        :param resource_path: 数据文件路径
        :param city: 城市拼音
        :param date: 保存日期
        :param city_short: 城市简写
        :param save_date_short: 保存日期简写
        :param err_rest_ids: 错误城市id
        :param db: 数据库配置信息
        '''
        self.resource_path = resource_path
        self.city = city
        self.date = date
        self.city_short = city_short
        self.save_date_short = save_date_short
        self.err_rest_ids = err_rest_ids
        self.db = db

        self.sql1 = "insert into t_e_rest_menu_level1_city_%s values" % save_date_short
        self.sql2 = "insert into t_e_rest_menu_level2_city_%s_%s values" % (
            city_short, save_date_short)
        self.sql3 = "insert into t_e_rest_menu_level2_unique_city_%s_%s values" % (
            city_short, save_date_short)
        self.sql4 = "insert into t_e_rest_menu_discount_city_%s values" % save_date_short

        self.batch1 = BatchSql(self.sql1)
        self.batch2 = BatchSql(self.sql2)
        self.batch3 = BatchSql(self.sql3)
        self.batch4 = BatchSql(self.sql4)

        self.rest_food = defaultdict(set)

    def parse_data(self, data):
        try:
            rest_id = getMapValue(data, 'param')
            if rest_id in self.err_rest_ids:
                return
            for item in data["data"]:
                # 验证是否是错误数据
                try:
                    foods = item['foods'][::-1]
                except KeyError:
                    print('错误数据\n{}'.format(item))
                    continue
                menu_name = getMapValue(item, 'name')
                self.batch1.addBatch(
                    [self.city, self.date, rest_id, menu_name])
                if self.batch1.getSize() > 100000:
                    self.db.update(self.batch1)
                    self.batch1.cleanBatch()
                for item2 in foods:
                    food_id = getMapValue(item2, 'virtual_food_id')
                    food_price_current = getMapValue(item2["specfoods"][0],
                                                     'price')
                    food_price_primary = getMapValue(item2["specfoods"][0],
                                                     'original_price')
                    has_activity = '0'
                    if food_price_primary not in ['-999', '0']:
                        has_activity = '1'
                    self.batch2.addBatch([
                        self.city, self.date, rest_id, food_id,
                        getMapValue(item2, 'name'),
                        getMapValue(item2, 'month_sales'),
                        getMapValue(item2, 'rating'),
                        getMapValue(item2, 'satisfy_count'),
                        food_price_current, has_activity, menu_name
                    ])
                    if self.batch2.getSize() > 100000:
                        self.db.update(self.batch2)
                        self.batch2.cleanBatch()
                    if food_id not in self.rest_food[rest_id]:
                        self.rest_food[rest_id].add(food_id)
                        self.batch3.addBatch([
                            self.city, self.date, rest_id, food_id,
                            getMapValue(item2, 'name'),
                            getMapValue(item2, 'month_sales'),
                            getMapValue(item2, 'rating'),
                            getMapValue(item2, 'satisfy_count'),
                            food_price_current, has_activity, menu_name
                        ])
                        if self.batch3.getSize() > 100000:
                            self.db.update(self.batch3)
                            self.batch3.cleanBatch()
                        if food_price_primary not in ['-999', '0']:
                            discount = '%.6f' % (float(food_price_current) /
                                                 float(food_price_primary))
                            self.batch4.addBatch([
                                self.city, self.date, rest_id, food_id,
                                food_price_primary, food_price_current,
                                discount
                            ])
                            if self.batch4.getSize() > 100000:
                                self.db.update(self.batch4)
                                self.batch4.cleanBatch()
        except Exception as e:
            print("解析数据报错", e)
            print('解析数据报错\n错误数据:{}'.format(data))

    def run(self):
        f_name = os.path.join(self.resource_path, 'menu.pickle')
        print("解析文件:", f_name)
        with open(f_name, "rb+") as f:
            while 1:
                try:
                    data = pickle.load(f)
                    self.parse_data(data)
                    # break
                except EOFError as e:
                    print("文件读取完成", e)
                    break
                except Exception as e2:
                    print("报错", e2)
                    break
        if self.batch1.getSize() > 0:
            self.db.update(self.batch1)
            self.batch1.cleanBatch()
        if self.batch2.getSize() > 0:
            self.db.update(self.batch2)
            self.batch2.cleanBatch()
        if self.batch3.getSize() > 0:
            self.db.update(self.batch3)
            self.batch3.cleanBatch()
        if self.batch4.getSize() > 0:
            self.db.update(self.batch4)
            self.batch4.cleanBatch()
Пример #13
0
class ParseMenuObject(ParseDataObject):
    """
        category.pickle数据解析
    """
    def __init__(self,
                 resource_path,
                 city,
                 date,
                 rest_area,
                 err_rest_ids,
                 db,
                 is_test=False):
        ParseDataObject.__init__(self, resource_path, is_test)
        self.err_rest_ids = err_rest_ids
        self.rest_food = defaultdict(set)
        self.city = city
        self.date = date
        self.rest_area = rest_area

        self.db = db
        sql1 = "insert into t_e_rest_menu_level1_import values"
        sql2 = "insert into t_e_rest_menu_level2_import values"
        sql3 = "insert into t_e_rest_menu_level2_unique_import values"
        sql4 = "insert into t_e_rest_menu_discount_import values"

        self.batch1 = BatchSql(sql1)
        self.batch2 = BatchSql(sql2)
        self.batch3 = BatchSql(sql3)
        self.batch4 = BatchSql(sql4)

    def parse(self, generator=None):
        generator = generator if generator else self.GeneratorPickleData()
        try:
            for data in generator:
                self.parse_data(data)
            self.insertAll()
        except:
            print("解析数据报错:", traceback.print_exc())

    def insert(self, batch, param):
        batch.addBatch(param)
        if batch.getSize() > 100000:
            self.db.update(batch)
            batch.cleanBatch()

    def insertAll(self):
        if self.batch1.getSize() > 0:
            self.db.update(self.batch1)
            self.batch1.cleanBatch()
        if self.batch2.getSize() > 0:
            self.db.update(self.batch2)
            self.batch2.cleanBatch()
        if self.batch3.getSize() > 0:
            self.db.update(self.batch3)
            self.batch3.cleanBatch()
        if self.batch4.getSize() > 0:
            self.db.update(self.batch4)
            self.batch4.cleanBatch()

    def parse_data(self, data):
        try:
            rest_id = getMapValue(data, 'param')
            # TODO err_rest_ids需要传入
            if rest_id in self.err_rest_ids:
                return
            for item in data["data"]:
                # 验证是否是错误数据
                try:
                    foods = item['foods'][::-1]
                except KeyError:
                    print('错误数据\n{}'.format(item))
                    continue
                menu_name = getMapValue(item, 'name')
                param1 = [
                    self.city, self.date, self.rest_area, rest_id, menu_name
                ]
                self.insert(self.batch1, param1)

                for item2 in foods:
                    food_id = getMapValue(item2, 'virtual_food_id')
                    food_price_current = getMapValue(item2["specfoods"][0],
                                                     'price')
                    food_price_primary = getMapValue(item2["specfoods"][0],
                                                     'original_price')
                    has_activity = '0'
                    if food_price_primary not in ['-999', '0']:
                        has_activity = '1'
                    param2 = [
                        self.city, self.date, self.rest_area, rest_id, food_id,
                        getMapValue(item2, 'name'),
                        getMapValue(item2, 'month_sales'),
                        getMapValue(item2, 'rating'),
                        getMapValue(item2, 'satisfy_count'),
                        food_price_current, has_activity, menu_name
                    ]
                    self.insert(self.batch2, param2)

                    if food_id not in self.rest_food[rest_id]:
                        self.rest_food[rest_id].add(food_id)
                        param3 = [
                            self.city, self.date, self.rest_area, rest_id,
                            food_id,
                            getMapValue(item2, 'name'),
                            getMapValue(item2, 'month_sales'),
                            getMapValue(item2, 'rating'),
                            getMapValue(item2, 'satisfy_count'),
                            food_price_current, has_activity, menu_name
                        ]
                        self.insert(self.batch3, param3)

                        if food_price_primary not in ['-999', '0']:
                            discount = '%.6f' % (float(food_price_current) /
                                                 float(food_price_primary))
                            param4 = [
                                self.city, self.date, self.rest_area, rest_id,
                                food_id, food_price_primary,
                                food_price_current, discount
                            ]
                            self.insert(self.batch4, param4)

        except Exception as e:
            print("解析数据报错", traceback.print_exc())
            print('解析数据报错\n错误数据:{}'.format(data))
Пример #14
0
class ParseCategory(object):
    def __init__(self, resource_path, city, city_cn, date, date_short, db):
        '''
        :param resource_path: 城市数据文件路径
        :param city: 城市拼音
        :param city_cn: 城市中文
        :param date: 保存日期
        :param date_short: 保存日期简写
        :param db: 数据配置
        :return:
        '''
        self.resource_path = resource_path
        self.city = city
        self.city_cn = city_cn
        self.date = date
        self.date_short = date_short
        self.db = db

        self.sql1 = "insert into t_e_rest_list_city_pre_%s values" % date_short
        self.sql2 = "insert into t_e_rest_active_city_%s values" % date_short
        self.sql3 = "insert into t_e_rest_category_city_%s values" % date_short
        self.sql4 = "insert into t_e_rest_open_time_city_%s values" % date_short
        self.sql5 = "insert into t_e_rest_money_off_city_%s values" % date_short
        self.sql6 = "insert into t_e_rest_money_off_avg_city_%s values" % date_short

        self.batch1 = BatchSql(self.sql1)
        self.batch2 = BatchSql(self.sql2)
        self.batch3 = BatchSql(self.sql3)
        self.batch4 = BatchSql(self.sql4)
        self.batch5 = BatchSql(self.sql5)
        self.batch6 = BatchSql(self.sql6)

        self.err_rest_ids = set()
        self.rest_category = defaultdict(set)
        self.category_level1_list = []
        self.category_level2_list = []

    def get_data_by_level2_id(self, category_level2_id):
        category_level1_id, category_level1_name, category_level2_name = '-999', '-999', '-999'
        for item in self.category_level2_list:
            if category_level2_id == str(item.get('id2')):
                category_level2_name = str(item.get('name2'))
                category_level1_id = str(item.get('id1'))
                category_level1_name = str(item.get('name'))
                break
        # for item in category_level1_list:
        #     if category_level1_id == str(item[0]):
        #         category_level1_name = str(item[1])
        #         break
        return category_level1_id, category_level1_name, category_level2_id, category_level2_name

    def parse_data(self, data, m):
        '''
        :param data: 城市数据
        :param city: 城市拼音
        :param date: 保存日期
        :param m:    城市地图信息
        :param date_short: 保存日期简写
        :param db:   数据库配置信息
        :return:
        '''
        try:
            category_item = data['param'][2]
            category_level2_id = str(category_item.get("id2"))
            item = data["data"]
            rest_id = getMapValue(item, "id")
            lat = float(getMapValue(item, "latitude"))
            lng = float(getMapValue(item, "longitude"))
            if len(rest_id) > 11:
                print('错误店铺id\nrest_id: {}'.format(rest_id))
            elif not (m.is_in_city(lng, lat)):
                self.err_rest_ids.add(rest_id)
                print('错误地址\nrest_id: {}, lat: {}, lng: {}, name: {}\n'
                      'address: {}'.format(rest_id, lat, lng,
                                           getMapValue(item, "name"),
                                           getMapValue(item, "address")))
            else:
                # 店铺去重
                if not self.rest_category[rest_id]:
                    self.batch1.addBatch([
                        self.city, self.date, rest_id,
                        getMapValue(item, "name"),
                        getMapValue(item, "phone", '/'),
                        getMapValue(item, "address"),
                        getMapValue(item, "average_cost"),
                        [1, 0][getMapValue(item, "delivery_mode") == '-999'],
                        getMapValue(item, "float_delivery_fee"),
                        getMapValue(item, "float_minimum_order_amount"),
                        [0, 1][getMapValue(item, "is_new") == 'True'],
                        [0, 1][getMapValue(item, "is_premium") == 'True'], lat,
                        lng,
                        getMapValue(item, "recent_order_num"), '-999'
                    ])
                    if self.batch1.getSize() > 10000:
                        self.db.update(self.batch1)
                        self.batch1.cleanBatch()
                    opening_hours = getMapValue(item, "opening_hours")
                    arr = json.loads(opening_hours.replace("\'", "\""))
                    if not isinstance(arr, list):
                        print('错误数据:', rest_id, opening_hours)
                        print('错误数据\nrest_id: {}, opening_hours: {}'.format(
                            rest_id, opening_hours))
                    else:
                        for s in arr:
                            times = s.split("/")
                            self.batch4.addBatch(
                                [self.city, self.date, rest_id, *times])
                            if self.batch4.getSize() > 100000:
                                self.db.update(self.batch4)
                                self.batch4.cleanBatch()
                    # TODO 平均满减折扣有问题
                    discount_rate_list = []
                    for active in item.get('activities'):
                        attribute = getMapValue(active, "attribute")
                        active_type = getMapValue(active, "icon_name")
                        self.batch2.addBatch([
                            self.city, self.date, rest_id,
                            getMapValue(active, "description"), active_type
                        ])
                        if self.batch2.getSize() > 100000:
                            self.db.update(self.batch2)
                            self.batch2.cleanBatch()
                        if active_type == '减' and attribute != '-999':
                            arr = json.loads(attribute)
                            for key, value in arr.items():
                                if isinstance(value, dict):
                                    sub_price = value.get("1")
                                    if sub_price == 0:
                                        sub_price = value.get("0")
                                else:
                                    sub_price = value
                                key = round(float(key), 2)
                                sub_price = round(float(sub_price), 2)
                                discount_rate = round(sub_price / key, 4)
                                discount_rate_list.append(discount_rate)
                                self.batch5.addBatch([
                                    self.city, self.date, rest_id, key,
                                    sub_price, discount_rate
                                ])
                                if self.batch5.getSize() > 100000:
                                    self.db.update(self.batch5)
                                    self.batch5.cleanBatch()

                    # TODO 平均满减折扣有问题
                    if len(discount_rate_list):
                        avg_discount_rate = sum(discount_rate_list) / len(
                            discount_rate_list)
                        avg_discount_rate = round(avg_discount_rate, 4)
                        self.batch6.addBatch(
                            [self.city, self.date, rest_id, avg_discount_rate])
                        if self.batch6.getSize() > 100000:
                            self.db.update(self.batch6)
                            self.batch6.cleanBatch()

                # 店铺对应品类去重
                if category_level2_id not in self.rest_category[rest_id]:
                    self.rest_category[rest_id].add(category_level2_id)
                    #TODO 取消了单独的品类文件,品类信息和店铺信息保存一起,可直接获取该店铺对应品类信息
                    # category_level1_id, category_level1_name, category_level2_id, category_level2_name = get_data_by_level2_id(category_level2_id)

                    category_level1_id = category_item.get('id1')
                    category_level1_name = category_item.get('name')
                    category_level2_name = category_item.get('name2')
                    self.batch3.addBatch([
                        self.city, self.date, category_level1_id,
                        category_level1_name, category_level2_id,
                        category_level2_name, rest_id,
                        getMapValue(item, "name")
                    ])
                    if self.batch3.getSize() > 100000:
                        self.db.update(self.batch3)
                        self.batch3.cleanBatch()
        except Exception as e:
            print("解析数据报错", traceback.format_exc())
            print('解析数据报错\n错误数据:{}'.format(data))

    def run(self):
        from ShopMonitor.elm.ElmWebParse.ElmMapUtil import ElmMapUtil
        m = ElmMapUtil(self.city_cn)
        f_name = os.path.join(self.resource_path, 'category.pickle')
        print("解析rest文件:", f_name)
        with open(f_name, "rb+") as f:
            while 1:
                try:
                    data = pickle.load(f)
                    self.parse_data(data, m)
                    # parse_data(data, city, date,m,date_short,db)
                except EOFError as e:
                    print("rest文件读取完成", e)
                    break
                except Exception as e2:
                    print("报错", e2)
                    break

        if self.batch1.getSize() > 0:
            self.db.update(self.batch1)
            self.batch1.cleanBatch()
        if self.batch2.getSize() > 0:
            self.db.update(self.batch2)
            self.batch2.cleanBatch()
        if self.batch3.getSize() > 0:
            self.db.update(self.batch3)
            self.batch3.cleanBatch()
        if self.batch4.getSize() > 0:
            self.db.update(self.batch4)
            self.batch4.cleanBatch()
        if self.batch5.getSize() > 0:
            self.db.update(self.batch5)
            self.batch5.cleanBatch()
        if self.batch6.getSize() > 0:
            self.db.update(self.batch6)
            self.batch6.cleanBatch()

        print('rest_list数:', len(self.rest_category.keys()))
        print('rest_category数:',
              sum([len(x) for x in self.rest_category.values()]))

        print(self.city_cn, "错误店铺数:", len(self.err_rest_ids))
        # with open("err_rest_ids_" + city_cn + ".txt", "w") as f:
        #     for item in err_rest_ids:
        #         f.write(str(item) + '\n')
        return self.err_rest_ids
Пример #15
0
class ParseRatingObject(ParseDataObject):
    """
        category.pickle数据解析
    """
    def __init__(self, resource_path, city, date, rest_area, db, is_test=False):
        ParseDataObject.__init__(self, resource_path, is_test)
        self.err_rest_ids = set()
        self.city = city
        self.date = date
        self.rest_area = rest_area

        self.db = db

        sql1 = "insert into t_e_rest_rating_import values"
        sql2 = "insert into t_e_rest_food_rating_import values"

        self.batch1 = BatchSql(sql1)
        self.batch2 = BatchSql(sql2)

    def parse(self, generator=None):
        generator = generator if generator else self.GeneratorPickleData()
        try:
            for data in generator:
                self.parse_data(data)
            self.insertAll()
        except:
            print("解析数据报错:", traceback.print_exc())

    def insert(self,batch,param):
        batch.addBatch(param)
        if batch.getSize() > 100000:
            self.db.update(batch)
            batch.cleanBatch()

    def insertAll(self):
        if self.batch1.getSize() > 0:
            self.db.update(self.batch1)
            self.batch1.cleanBatch()
        if self.batch2.getSize() > 0:
            self.db.update(self.batch2)
            self.batch2.cleanBatch()


    def parse_data(self,data):
        print("data",data)
        try:
            rest_id = str(data["param"])
            if len(rest_id) <= 11:
                item1 = data["data"]

                print('item1',item1)
                rating_id = str(uuid.uuid1())
                print("rating_id",rating_id)

                rate_time = getMapValue(item1, "rated_at")
                rating_text = getMapValue(item1, "rating_text")
                reply_text = getMapValue(item1, "reply_text")
                rating_star = getMapValue(item1, "rating_star")
                username = getMapValue(item1, "username")

                param1 = [self.city, self.date,self.rest_area,rest_id,rating_id, rate_time,
                            rating_text,reply_text,rating_star,username]

                self.insert(self.batch1,param1)
                for item2 in item1["item_ratings"]:
                    food_id = getMapValue(item2, "food_id")
                    food_name = getMapValue(item2,"food_name")
                    image_hash = getMapValue(item2, "image_hash")

                    param2 = [self.city, self.date,self.rest_area, rest_id, rating_id,rate_time,
                              food_id,image_hash,food_name,rating_star,rating_text,reply_text]

                    self.insert(self.batch2, param2)
            else:
                print("错误数据", data)

        except Exception as e:
            print(traceback.print_exc())
            print('解析数据报错\n错误数据:{}'.format(data))