def get_phone_total(): total_count = Commodity.select(fn.SUM(Commodity.total).alias('tc')).dicts()[0]['tc'] commodity_count = Commodity.select().count() brand_count = Commodity.select().group_by(Commodity.brand).count() model_count = Commodity.select().group_by(Commodity.model).count() PhoneTotal.create( total_count=total_count, commodity_count=commodity_count, brand_count=brand_count, model_count=model_count )
def get_phone_size(): screen_size_list = [] width_list = [] thickness_list = [] length_list = [] weight_list = [] for commodity in Commodity.select().where(Commodity.os.not_in(['功能机', '页面未注明'])): if commodity.screen_size is not None: screen_size_list.append(commodity.screen_size) if commodity.width is not None: width_list.append(commodity.width) if commodity.thickness is not None: thickness_list.append(commodity.thickness) if commodity.length is not None: length_list.append(commodity.length) if commodity.weight is not None: weight_list.append(commodity.weight) PhoneSize.create( screen_size_avg=calculate_average(screen_size_list, True), screen_size_med=calculate_median(screen_size_list, True), width_avg=calculate_average(width_list, True), width_med=calculate_median(width_list, True), thickness_avg=calculate_average(thickness_list, True), thickness_med=calculate_median(thickness_list, True), length_avg=calculate_average(length_list, True), length_med=calculate_median(length_list, True), weight_avg=calculate_average(weight_list, True), weight_med=calculate_median(weight_list, True) )
def get_phone_platform(): source_list = ['京东', '苏宁'] for source in source_list: total_count = Commodity.select(fn.SUM(Commodity.total).alias('tc')) \ .where(Commodity.source == source).dicts()[0]['tc'] commodity_count = Commodity.select().where(Commodity.source == source).count() self_count = Commodity.select(fn.SUM(Commodity.total).alias('tc')) \ .where((Commodity.source == source) & (Commodity.is_self == True)).dicts()[0]['tc'] non_self_count = Commodity.select(fn.SUM(Commodity.total).alias('tc')) \ .where((Commodity.source == source) & (Commodity.is_self == False)).dicts()[0]['tc'] PhonePlatform.create( source=source, total_count=total_count, commodity_count=commodity_count, self_percentage=calculate_percentage(total_count, self_count), non_self_percentage=calculate_percentage(total_count, non_self_count) )
def get_soc(): for commodity in Commodity.select(): try: soc = SoC.get( SoC.soc_mfrs == commodity.soc_mfrs, SoC.soc_model == commodity.soc_model ) soc.total += commodity.total soc.save() except SoC.DoesNotExist: SoC.create( soc_mfrs=commodity.soc_mfrs, soc_model=commodity.soc_model, total=commodity.total )
def get_phone(): for commodity in Commodity.select(): try: phone = Phone.get( Phone.brand == commodity.brand, Phone.model == commodity.model ) phone.total += commodity.total phone.save() except Phone.DoesNotExist: Phone.create( brand=commodity.brand, model=commodity.model, total=commodity.total )
def get_phone_price_and_sales(): # PhonePriceAndSales.create( # type='功能机', # price_range='500-1000元', # total=0 # ) # for commodity in Commodity.select().where(Commodity.os.in_(['Android', 'iOS'])): # if 0 < commodity.price < 500: # ppas = PhonePriceAndSales.get_by_id(1) # ppas.total += commodity.total # ppas.save() # if 500 <= commodity.price < 1000: # ppas = PhonePriceAndSales.get_by_id(2) # ppas.total += commodity.total # ppas.save() # if 1000 <= commodity.price < 2000: # ppas = PhonePriceAndSales.get_by_id(3) # ppas.total += commodity.total # ppas.save() # if 2000 <= commodity.price < 3000: # ppas = PhonePriceAndSales.get_by_id(4) # ppas.total += commodity.total # ppas.save() # if 3000 <= commodity.price < 4000: # ppas = PhonePriceAndSales.get_by_id(5) # ppas.total += commodity.total # ppas.save() # if 4000 <= commodity.price < 5000: # ppas = PhonePriceAndSales.get_by_id(6) # ppas.total += commodity.total # ppas.save() # if 5000 <= commodity.price < 8000: # ppas = PhonePriceAndSales.get_by_id(7) # ppas.total += commodity.total # ppas.save() # if commodity.price >= 8000: # ppas = PhonePriceAndSales.get_by_id(8) # ppas.total += commodity.total # ppas.save() # total_count = PhonePriceAndSales.select(fn.SUM(PhonePriceAndSales.total).alias('tc')) \ # .where(PhonePriceAndSales.type == '智能手机').dicts()[0]['tc'] # print(total_count) # for ppas in PhonePriceAndSales.select().where(PhonePriceAndSales.type == '智能手机'): # ppas.percentage = calculate_percentage(total_count, ppas.total) # ppas.save() for commodity in Commodity.select().where(Commodity.os.in_(['功能机'])): if 0 < commodity.price < 100: ppas = PhonePriceAndSales.get_by_id(9) ppas.total += commodity.total ppas.save() if 100 <= commodity.price < 200: ppas = PhonePriceAndSales.get_by_id(10) ppas.total += commodity.total ppas.save() if 200 <= commodity.price < 300: ppas = PhonePriceAndSales.get_by_id(11) ppas.total += commodity.total ppas.save() if 300 <= commodity.price < 400: ppas = PhonePriceAndSales.get_by_id(12) ppas.total += commodity.total ppas.save() if 400 <= commodity.price < 500: ppas = PhonePriceAndSales.get_by_id(13) ppas.total += commodity.total ppas.save() if 500 <= commodity.price < 1000: ppas = PhonePriceAndSales.get_by_id(14) ppas.total += commodity.total ppas.save() total_count = PhonePriceAndSales.select(fn.SUM(PhonePriceAndSales.total).alias('tc')) \ .where(PhonePriceAndSales.type == '功能机').dicts()[0]['tc'] print(total_count) for ppas in PhonePriceAndSales.select().where(PhonePriceAndSales.type == '功能机'): ppas.percentage = calculate_percentage(total_count, ppas.total) ppas.save()
def preprocess_data(): # 删除销量为0的无效数据 # query = Commodity.delete().where(Commodity.total == 0) # query.execute() # 找出销量被标记为-1的商品 # for commodity in Commodity.select().where(Commodity.total == -1): # print(commodity.url) # 找出价格被标记为-1和-2的商品 # for commodity in Commodity.select().where(Commodity.price.in_([-1, -2])): # print(commodity.url) # 统一品牌名 for commodity in Commodity.select().group_by(Commodity.brand): print(commodity.brand) # query = Commodity.update(brand='21克').where(Commodity.brand.in_(['21KE', '21克(21ke)'])) # query.execute() # query = Commodity.update(brand='百合').where(Commodity.brand.in_(['BIHEE'])) # query.execute() # query = Commodity.update(brand='征服').where(Commodity.brand.in_(['CONQUEST'])) # query.execute() # query = Commodity.update(brand='龙贝尔').where(Commodity.brand.in_(['LBER'])) # query.execute() # query = Commodity.update(brand='新路虎').where(Commodity.brand.in_(['LGRAVER'])) # query.execute() # query = Commodity.update(brand='硕王').where(Commodity.brand.in_(['SAILF'])) # query.execute() # query = Commodity.update(brand='优畅想').where(Commodity.brand.in_(['U-Magic'])) # query.execute() # query = Commodity.update(brand='纬图').where(Commodity.brand.in_(['VERTU'])) # query.execute() # query = Commodity.update(brand='索野').where(Commodity.brand.in_(['soyes', '索野(SOYES)', '索野(SOYES)'])) # query.execute() # query = Commodity.update(brand='京崎').where(Commodity.brand.in_(['京崎(TOOKY)'])) # query.execute() # query = Commodity.update(brand='依偎').where(Commodity.brand.in_(['依偎(ivvi)'])) # query.execute() # query = Commodity.update(brand='克里特').where(Commodity.brand.in_(['克里特(kreta)'])) # query.execute() # query = Commodity.update(brand='北斗天地').where(Commodity.brand.in_(['北斗天地(BDTD)'])) # query.execute() # query = Commodity.update(brand='守护宝(上海中兴)').where(Commodity.brand.in_(['守护宝', '守护宝(angel care)'])) # query.execute() # query = Commodity.update(brand='尼凯恩').where(Commodity.brand.in_(['尼凯恩(neken)'])) # query.execute() # query = Commodity.update(brand='康佳').where(Commodity.brand.in_(['康佳(KONKA)', '康佳(KONKA)'])) # query.execute() # query = Commodity.update(brand='捷语').where(Commodity.brand.in_(['捷语(GoFly)'])) # query.execute() # query = Commodity.update(brand='朵唯').where(Commodity.brand.in_(['朵唯(DOOV)', '朵唯(DOOV)'])) # query.execute() # query = Commodity.update(brand='天语').where(Commodity.brand.in_(['天语(K-Touch)', '天语(K-TOUCH)', '天语(K-Touch)'])) # query.execute() # query = Commodity.update(brand='欧奇').where(Commodity.brand.in_(['欧奇(OUKI)'])) # query.execute() # query = Commodity.update(brand='波导').where(Commodity.brand.in_(['波导(BiRD)'])) # query.execute() # query = Commodity.update(brand='海信').where(Commodity.brand.in_(['海信(Hisense)', '海信(Hisense)'])) # query.execute() # query = Commodity.update(brand='海语').where(Commodity.brand.in_(['海语(HAIYU)'])) # query.execute() # query = Commodity.update(brand='爱户外').where(Commodity.brand.in_(['爱户外(ioutdoor)'])) # query.execute() # query = Commodity.update(brand='糖果').where(Commodity.brand.in_(['糖果(sugar)'])) # query.execute() # query = Commodity.update(brand='索爱').where(Commodity.brand.in_(['索爱(soaiy)', '索爱(soaiy)'])) # query.execute() # query = Commodity.update(brand='纽万').where(Commodity.brand.in_(['纽万(NEWONE)'])) # query.execute() # query = Commodity.update(brand='纽曼').where(Commodity.brand.in_(['纽曼(Newman)', '纽曼(Newsmy)', '纽曼(Newman)'])) # query.execute() # query = Commodity.update(brand='詹姆士').where(Commodity.brand.in_(['詹姆士(GEMRY)', '詹姆士(GERMY)'])) # query.execute() # query = Commodity.update(brand='誉品').where(Commodity.brand.in_(['誉品(YEPEN)', '誉品(YEPEN)'])) # query.execute() # query = Commodity.update(brand='诺亚信').where(Commodity.brand.in_(['诺亚信(NOAIN)'])) # query.execute() # query = Commodity.update(brand='遨游').where(Commodity.brand.in_(['遨游(AORO)'])) # query.execute() # query = Commodity.update(brand='酷和').where(Commodity.brand.in_(['酷和(KUH)'])) # query.execute() # query = Commodity.update(brand='酷比').where(Commodity.brand.in_(['酷比(koobee)', '酷比(koobee)'])) # query.execute() # query = Commodity.update(brand='酷派').where(Commodity.brand.in_(['酷派(Coolpad)', '酷派(Coolpad)'])) # query.execute() # query = Commodity.update(brand='金伯利').where(Commodity.brand.in_(['金伯利(JINBOLI)'])) # query.execute() # query = Commodity.update(brand='金立').where(Commodity.brand.in_(['金立(GiONEE)', '金立(GiONEE)', '金立(Gionee)'])) # query.execute() # query = Commodity.update(brand='青橙').where(Commodity.brand.in_(['青橙(Green Orange)'])) # query.execute() # query = Commodity.update(brand='飞利浦').where(Commodity.brand.in_(['飞利浦(Philips)', '飞利浦(PHILIPS )'])) # query.execute() # # query = Commodity.update(brand='苹果').where(Commodity.brand.in_(['APPLE', 'Apple', 'iPhone', '苹果(Apple)'])) # query.execute() # query = Commodity.update(brand='荣耀').where(Commodity.brand.in_(['HONOR', '荣耀(honor)', '荣耀(HONOR)', '荣耀(honor)'])) # query.execute() # query = Commodity.update(brand='小米').where(Commodity.brand.in_(['XIAOMI', '小米(mi)', '小米(MI)'])) # query.execute() # query = Commodity.update(brand='OPPO').where(Commodity.brand.in_(['oppo'])) # query.execute() # query = Commodity.update(brand='vivo').where(Commodity.brand.in_(['维沃(vivo)', 'iQOO'])) # query.execute() # query = Commodity.update(brand='三星').where(Commodity.brand.in_(['三星(SAMSUNG)', '三星(SAMSUNG)'])) # query.execute() # query = Commodity.update(brand='中兴').where(Commodity.brand.in_(['中兴(ZTE)', '中兴(ZTE)'])) # query.execute() # query = Commodity.update(brand='努比亚').where(Commodity.brand.in_(['努比亚(nubia)', '努比亚(nubia)'])) # query.execute() # query = Commodity.update(brand='华为').where(Commodity.brand.in_(['华为(HUAWEI)', '华为(HUAWEI)', '麦芒'])) # query.execute() # query = Commodity.update(brand='锤子/坚果').where(Commodity.brand.in_(['坚果', '锤子(smartisan)'])) # query.execute() # query = Commodity.update(brand='多亲').where(Commodity.brand.in_(['多亲(QIN)', '多亲(QIN)'])) # query.execute() # query = Commodity.update(brand='摩托罗拉').where(Commodity.brand.in_(['摩托罗拉(MOTOROLA)', '摩托罗拉(Motorola)'])) # query.execute() # query = Commodity.update(brand='柔宇').where(Commodity.brand.in_(['柔宇(ROYOLE)', '柔宇(ROYOLE)'])) # query.execute() # query = Commodity.update(brand='realme').where(Commodity.brand.in_(['真我(realme)'])) # query.execute() # query = Commodity.update(brand='索尼').where(Commodity.brand.in_(['索尼(SONY)', '索尼(SONY)', 'SONY'])) # query.execute() # query = Commodity.update(brand='美图').where(Commodity.brand.in_(['美图(Meitu)'])) # query.execute() # query = Commodity.update(brand='联想').where(Commodity.brand.in_(['联想(Lenovo)', '联想(LENOVO)', '联想(Lenovo)', # '联想(lenovo)'])) # query.execute() # query = Commodity.update(brand='诺基亚').where(Commodity.brand.in_(['诺基亚(NOKIA)', '诺基亚(NOKIA)'])) # query.execute() # query = Commodity.update(brand='魅族').where(Commodity.brand.in_(['魅族(MEIZU)', '魅族(MEIZU)', '魅族(meizu)'])) # query.execute() # 规范产品名 # name_list = ['黑鲨4 Pro'] # Commodity.update(model='黑鲨游戏手机4 Pro').where(Commodity.model.in_(name_list)).execute() # 规范操作系统名和SoC信息 # target_brand = '荣耀' # model_list = ['荣耀畅玩9A'] # Commodity.update({ # Commodity.os: 'Android', # Commodity.soc_mfrs: '联发科', # Commodity.soc_model: 'P35' # }).where((Commodity.model.in_(model_list)) & (Commodity.brand == target_brand)).execute() # for commodity in Commodity.select().where(Commodity.brand == target_brand).group_by(Commodity.model): # print(commodity.model) # for commodity in Commodity.select().where(Commodity.brand == target_brand).group_by(Commodity.soc_mfrs): # print(commodity.soc_mfrs) # for commodity in Commodity.select().where(Commodity.brand == target_brand).group_by(Commodity.soc_model): # print(commodity.soc_model) Commodity.update({Commodity.soc_model: '未知'}).where( Commodity.soc_model.in_(['其它', '其他', '以官网信息为准', '以官方信息为准'])).execute() for commodity in Commodity.select().group_by(Commodity.soc_model): print(commodity.soc_model)
def insert_jd_all_commodity(browser: Chrome): for target_sku in TargetSku.select().where(TargetSku.source == '京东'): # 获取当前商品SKU编号 sku: str = target_sku.sku # 检查当前SKU是否在数据库中保存的SKU中, 避免销量重复计数 result = ExistedSku.get_or_none(ExistedSku.source == '京东', ExistedSku.sku == sku) if result is not None: # 删除已经保存的商品target_sku delete_saved_commodity_sku(sku) print(f'---SKU编号为 {sku} 的商品信息已保存过---') continue # 开始抓取商品信息 commodity = Commodity() commodity.source = '京东' commodity.url = 'https://item.jd.com/' + sku + '.html' # 打开并切换到当前商品页面 switch_to_current_sku_page(browser, commodity.url) # 从后端API接口获取并保存已上架的SKU get_jd_sku_from_api(browser, sku) try: commodity.price = float( browser.find_element_by_css_selector( 'span.price:nth-child(2)').text) except (ValueError, NoSuchElementException): # 价格显示为待发布时或商品以下柜时, 抛出异常 commodity.price = -1 try: commodity.title = browser.find_element_by_class_name( 'sku-name').text.strip() except NoSuchElementException: commodity.title = '无商品标题' try: total_str = browser.find_element_by_css_selector( '#comment-count > a').text commodity.total = parse_jd_count_str(total_str) except NoSuchElementException: # 商品为预约状态时销量不显示在价格旁边, 抛出异常 commodity.total = -1 # 判断是否为京东自营 try: self_str = browser.find_element_by_class_name('u-jd').text if self_str == '自营': self = True else: self = False except NoSuchElementException: self = False commodity.is_self = self try: commodity.shop_name = browser.find_element_by_css_selector( '#crumb-wrap > div > div.contact.fr.clearfix > div.J-hove-wrap.EDropdown.fr > div:nth-child(1) > div ' '> a').text except NoSuchElementException: commodity.shop_name = '店铺名称为空' # 从商品介绍中获取商品信息 try: commodity.brand = browser.find_element_by_css_selector( '#parameter-brand > li > a').text except NoSuchElementException: commodity.brand = '品牌未注明' intro = browser.find_elements_by_css_selector('.parameter2 > li') intro_list = [] for i in intro: intro_list.append(i.text) # 预赋值, 防止注入空置报错 commodity.os = '页面未注明' commodity.model = '页面未注明' for intro_item in intro_list: if '操作系统' in intro_item: commodity.os = intro_item.replace('操作系统:', '') if 'CPU型号' in intro_item: commodity.soc_model = intro_item.replace('CPU型号:', '') if '商品名称' in intro_item: commodity.model = intro_item.replace('商品名称:', '') # 下滑点击 规格与包装 选项 window_scroll_by(browser, 1200) js_script = 'document.querySelector("#detail > div.tab-main.large > ul > li:nth-child(2)").click()' browser.execute_script(js_script) sleep(1) # 从 规格与包装 中获取商品信息 spec_list = browser.find_elements_by_class_name('Ptable-item') for spec_item in spec_list: spec_item_title = spec_item.find_element_by_tag_name('h3').text item_list = spec_item.find_elements_by_class_name('clearfix') if '主体' == spec_item_title: for item in item_list: item_name = item.find_element_by_tag_name('dt').text item_value = item.find_element_by_tag_name('dd').text if '产品名称' == item_name: commodity.model = item_value if '基本信息' == spec_item_title: for item in item_list: item_name = item.find_element_by_tag_name('dt').text item_value = item.find_element_by_tag_name('dd').text.replace('mm', '').replace('MM', '')\ .replace('mM', '').replace('Mm', '').replace('g', '').replace('G', '').replace('约', '')\ .replace('大约', '').replace('左右', '').replace('大概', '').strip() try: if '机身宽度' in item_name: commodity.width = float(item_value) if '机身厚度' in item_name: commodity.thickness = float(item_value) if '机身长度' in item_name: commodity.length = float(item_value) if '机身重量' in item_name: commodity.weight = float(item_value) except ValueError: pass if '主芯片' == spec_item_title: for item in item_list: item_name = item.find_element_by_tag_name('dt').text item_value = item.find_element_by_tag_name('dd').text if 'CPU品牌' == item_name: commodity.soc_mfrs = item_value if '屏幕' == spec_item_title: for item in item_list: item_name = item.find_element_by_tag_name('dt').text item_value_str = item.find_element_by_tag_name('dd').text if '主屏幕尺寸' in item_name: try: item_value = float( item_value_str.replace('英寸', '').strip()) commodity.screen_size = item_value except ValueError: pass # 保存商品信息 commodity.save() # 删除已经保存的商品target_sku delete_saved_commodity_sku(sku) print(f'------SKU编号为 {sku} 的商品信息保存完毕------') # 回到手机分类页面 back_to_first_window(browser)
def insert_sn_all_commodity(browser: Chrome): for target_sku in SNTargetSku.select(): # 获取当前商品店铺代码和SKU编号 shop_code: str = target_sku.shop_code sku: str = target_sku.sku # 检查当前SKU是否在数据库中保存的SKU中, 避免销量重复计数 result = SNExistedSku.get_or_none(SNExistedSku.shop_code == shop_code, SNExistedSku.sku == sku) if result is not None: # 删除已经保存的商品target_sku delete_saved_commodity_sku(shop_code, sku) print(f'---SKU编号为 {sku} 的商品信息已保存过---') continue # 开始抓取商品信息 commodity = Commodity() commodity.source = '苏宁' commodity.url = 'https://product.suning.com/' + shop_code + '/' + sku + '.html' # 打开并切换到当前商品页面 switch_to_current_sku_page(browser, commodity.url) # 从后端API接口获取所有SKU和销量 commodity.total = get_sn_sku_and_total_from_api( browser, shop_code, sku) # 判断是否为苏宁自营, 苏宁自营的店铺码为 0000000000 if int(shop_code) == 0: commodity.is_self = True else: commodity.is_self = False try: commodity.title = browser.find_element_by_id( 'itemDisplayName').text except NoSuchElementException: commodity.title = '无商品标题' try: commodity.price = float( browser.find_element_by_class_name('mainprice').text.replace( '¥', '')) except (ValueError, NoSuchElementException): commodity.price = -2 try: commodity.shop_name = browser.find_element_by_class_name( 'header-shop-name').text except NoSuchElementException: commodity.shop_name = '店铺名称为空' # 从商品介绍中获取商品信息 # 预赋值, 防止注入空置报错 commodity.brand = '页面未注明' commodity.model = '页面未注明' commodity.os = '页面未注明' intro_list = browser.find_elements_by_css_selector( '#phoneParameters > ul > li') for intro in intro_list: intro_title = intro.find_element_by_tag_name('p').text items = intro.find_elements_by_css_selector( 'dl > dd > div > ul > li') if intro_title == '屏幕': for item in items: if '屏幕尺寸' in item.text: commodity.screen_size = float( item.text.replace('屏幕尺寸:', '').replace('英寸', '').strip()) if intro_title == 'CPU': for item in items: if 'CPU型号' in item.text: commodity.soc_model = item.text.replace('CPU型号:', '') # 下滑点击 包装及参数 选项 window_scroll_by(browser, 1500) browser.execute_script( 'document.querySelector("#productParTitle > a").click()') sleep(1) # 从 规格与包装 中获取商品信息 spec_list = browser.find_elements_by_css_selector( '#itemParameter > tbody > tr') for spec in spec_list: if spec.get_attribute('parametercode') is not None: spec_name = spec.find_element_by_tag_name('span').text spec_value = spec.find_element_by_class_name('val').text if spec_name == '品牌': commodity.brand = spec_value if spec_name == '型号': commodity.model = spec_value if spec_name == '手机操作系统': commodity.os = spec_value if spec_name == 'CPU品牌': commodity.soc_mfrs = spec_value if spec_name == 'CPU型号': commodity.soc_model = spec_value try: spec_val = spec_value.replace('mm', '').replace('MM', '').replace('毫米', '').replace('英寸', '')\ .replace('mM', '').replace('Mm', '').replace('g', '').replace('G', '').replace('约', '')\ .replace('大约', '').replace('左右', '').replace('克', '').replace('寸', '').strip() if spec_name == '屏幕尺寸': commodity.screen_size = float(spec_val) if spec_name == '机身长度': commodity.length = float(spec_val) if spec_name == '机身宽度': commodity.width = float(spec_val) if spec_name == '机身厚度': commodity.thickness = float(spec_val) if spec_name == '重量': commodity.weight = float(spec_val) except ValueError: pass # 保存商品信息 commodity.save() # 删除已经保存的商品target_sku delete_saved_commodity_sku(shop_code, sku) print(f'------SKU编号为 {sku} 的商品信息保存完毕------') # 回到手机分类页面 back_to_first_window(browser) sleep(2)