def _deal_with_jd_goods(goods_link, my_lg):
    '''
    Handle one JD (jd.com) goods link: extract the goods_id, crawl the goods
    page, persist it to SqlServer (best-effort) and return an API-compatible
    payload.
    :param goods_link: raw jd goods url; the goods_id is parsed out of it
    :param my_lg: logger instance
    :return: result of compatible_api_goods_data(...) on success, or an
             _error_data(...) payload when the id can't be parsed or the
             crawl returns no data
    '''
    my_lg.info('进入京东商品处理接口...')
    goods_id = _get_jd_goods_id(goods_link)
    if goods_id == '':
        return _error_data(msg='goods_id匹配失败!请检查url是否正确!')

    # Rebuild a canonical jd item url from the extracted goods_id.
    jd_url = 'https://item.jd.com/{0}.html'.format(goods_id)
    data = get_one_jd_data(wait_to_deal_with_url=jd_url, my_lg=my_lg)
    if data.get('msg', '') == 'data为空!':
        return _error_data(msg='该goods_id:{0}, 抓取数据失败!'.format(goods_id))

    # Map the jd sub-site (jd / jd global / ...) to the internal site_id,
    # then normalize the crawled dict into the db model shape.
    site_id = _from_jd_type_get_site_id(type=data.get('jd_type'))
    data = _get_right_model_data(data=data, site_id=site_id, logger=my_lg)

    my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
    my_lg.info('------>>>| 正在存储的数据为: ' + data.get('goods_id', ''))
    params = _get_db_jd_insert_params(item=data)
    # NOTE(review): 'ModfiyTime' is misspelled but must match the existing
    # db column name — do not "fix" it here without a schema migration.
    sql_str = 'insert into dbo.GoodsInfoAutoGet(GoodsID, GoodsUrl, UserName, CreateTime, ModfiyTime, ShopName, Account, GoodsName, SubTitle, LinkName, Price, TaoBaoPrice, PriceInfo, SKUName, SKUInfo, ImageUrl, PropertyInfo, DetailInfo, SellCount, SiteID, IsDelete) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
    # Best-effort insert: a failed insert (e.g. duplicate goods_id) must not
    # prevent returning the freshly crawled data, so the result is ignored
    # (this replaces the original no-op `if is_insert_into: pass else: pass`).
    my_pipeline._insert_into_table_2(sql_str=sql_str, params=params, logger=my_lg)

    return compatible_api_goods_data(data=data, my_lg=my_lg)
def _deal_with_tb_goods(goods_link, my_lg):
    '''
    Handle one taobao goods link: extract the goods_id, crawl the goods
    page, persist it to SqlServer (best-effort) and return an API-compatible
    payload.
    :param goods_link: raw taobao goods url; the goods_id is parsed out of it
    :param my_lg: logger instance
    :return: json_str — result of compatible_api_goods_data(...) on success,
             or an _error_data(...) payload when the id can't be parsed or
             the crawl returns no data
    '''
    my_lg.info('进入淘宝商品处理接口...')
    goods_id = _get_tb_goods_id(goods_link)
    if goods_id == '':
        return _error_data(msg='goods_id匹配失败!请检查url是否正确!')

    # Rebuild a clean, canonical taobao item url from the extracted goods_id.
    tb_url = 'https://item.taobao.com/item.htm?id=' + goods_id
    data = get_one_tb_data(tb_url=tb_url)
    my_lg.info(str(data))
    if data.get('msg', '') == 'data为空!':
        return _error_data(msg='该goods_id:{0}, 抓取数据失败!'.format(goods_id))

    # site_id=1 is the fixed internal id for taobao; normalize into db model shape.
    data = _get_right_model_data(data=data, site_id=1, logger=my_lg)

    my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
    my_lg.info('------>>>| 正在存储的数据为: ' + data.get('goods_id', ''))
    params = _get_db_taobao_insert_params(item=data)
    # NOTE(review): 'ModfiyTime' is misspelled but must match the existing
    # db column name — do not "fix" it here without a schema migration.
    sql_str = 'insert into dbo.GoodsInfoAutoGet(GoodsID, GoodsUrl, UserName, CreateTime, ModfiyTime, ShopName, Account, GoodsName, SubTitle, LinkName, Price, TaoBaoPrice, PriceInfo, SKUName, SKUInfo, ImageUrl, PropertyInfo, DetailInfo, SellCount, SiteID, IsDelete) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
    # Best-effort insert: a failed insert (e.g. duplicate goods_id) must not
    # prevent returning the freshly crawled data, so the result is ignored
    # (this replaces the original no-op `if is_insert_into: pass else: pass`).
    my_pipeline._insert_into_table_2(sql_str=sql_str, params=params, logger=my_lg)

    return compatible_api_goods_data(data=data, my_lg=my_lg)
class GoodsKeywordsSpider(object):
    """Keyword-driven hot-goods crawler.

    For every keyword row stored in the db it queries the mobile/ajax search
    endpoints of taobao / 1688 / tmall / jd, collects the top-selling goods
    ids, crawls each goods page, stores the goods into SqlServer, and finally
    links goods_id <-> keyword_id in a middle table.
    """

    def __init__(self):
        self._set_logger()
        self.msg = ''
        # NOTE(review): _init_debugging_api() is called twice; the first
        # call's return value is discarded. It is side-effect free, so this
        # is only redundant, not harmful.
        self._init_debugging_api()
        self.debugging_api = self._init_debugging_api()
        self._set_func_name_dict()
        self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
        # SQL template used to insert rows into goods_id_and_keyword_middle_table.
        self.add_keyword_id_for_goods_id_sql_str = kw_insert_str_1

    def _set_logger(self):
        """Create a day-stamped file/console logger for this spider."""
        self.my_lg = set_logger(
            log_file_name=MY_SPIDER_LOGS_PATH + '/goods_keywords/_/' + str(get_shanghai_time())[0:10] + '.txt',
            console_log_level=INFO,
            file_log_level=ERROR)

    def _init_debugging_api(self) -> dict:
        """Return the per-site crawl switches, keyed by internal site type.

        :return: dict mapping site type (1 taobao, 2 ali-1688, 3 tmall,
                 4 jd) -> bool (True = crawl this site)
        """
        return {
            1: True,    # taobao
            2: True,    # ali 1688
            3: True,    # tmall
            4: True,    # jd
        }

    def _set_func_name_dict(self):
        # Dispatch-call templates keyed by site name. NOTE(review): these
        # strings look intended for eval()-style dispatch; no caller is
        # visible in this file — verify before removing.
        self.func_name_dict = {
            'taobao': 'self._taobao_keywords_spider(goods_id_list={0}, keyword_id={1})',
            'ali': 'self._ali_keywords_spider(goods_id_list={0}, keyword_id={1})',
            'tmall': 'self._tmall_keywords_spider(goods_id_list={0}, keyword_id={1})',
            'jd': 'self._jd_keywords_spider(goods_id_list={0}, keyword_id={1})'
        }

    def _just_run(self):
        """Main loop: for each keyword in the db, crawl every enabled site."""
        while True:
            # Fetch the keyword rows and the goods_ids already present in the db.
            try:
                result = list(self.my_pipeline._select_table(sql_str=kw_select_str_1))
                self.my_lg.info('正在获取db中已存在的goods_id...')
                result_2 = list(self.my_pipeline._select_table(sql_str=kw_select_str_2))
                self.my_lg.info('db中已存在的goods_id获取成功!')
            except TypeError:
                # _select_table returns None on connection failure, which makes
                # list(...) raise TypeError.
                self.my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
                result = None
                result_2 = None

            if result is not None and result_2 is not None:
                self.my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
                self.my_lg.info(str(result))
                self.my_lg.info('--------------------------------------------------------')
                self.my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
                # Running counter of processed goods; drives periodic db reconnects.
                self.add_goods_index = 0
                self.db_existed_goods_id_list = [item[0] for item in result_2]
                # Release the raw rows promptly.
                try:
                    del result_2
                except:
                    pass
                gc.collect()

                for item in result:  # item is (keyword_id, keyword); finish all enabled sites before the next keyword
                    self.my_lg.info('正在处理id为{0}, 关键字为 {1} ...'.format(item[0], item[1]))
                    for type, type_value in self.debugging_api.items():  # iterate the e-commerce sites to crawl
                        if type_value is False:
                            self.my_lg.info('api为False, 跳过!')
                            continue

                        # Periodically rebuild the db pipeline to avoid a stale
                        # long-lived connection going unresponsive.
                        if self.add_goods_index % 20 == 0:
                            self.my_lg.info('my_pipeline客户端重连中...')
                            try:
                                del self.my_pipeline
                            except:
                                pass
                            self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                            self.my_lg.info('my_pipeline客户端重连完毕!')

                        goods_id_list = self._get_keywords_goods_id_list(type=type, keyword=item)
                        self.my_lg.info('关键字为{0}, 获取到的goods_id_list 如下: {1}'.format(item[1], str(goods_id_list)))
                        '''处理goods_id_list'''
                        self._deal_with_goods_id_list(type=type, goods_id_list=goods_id_list, keyword_id=item[0])
                        sleep(3)

    def _get_keywords_goods_id_list(self, type, keyword) -> list:
        """Fetch the top-selling goods_id list for one keyword on one site.

        :param type: site type (1 taobao, 2 ali-1688, 3 tmall, 4 jd)
        :param keyword: (keyword_id, keyword) tuple
        :return: list of goods ids (tmall returns item urls instead — see
                 _get_tmall_goods_keywords_goods_id_list)
        """
        if type == 1:
            self.my_lg.info('下面是淘宝的关键字采集...')
            goods_id_list = self._get_taobao_goods_keywords_goods_id_list(keyword=keyword)
        elif type == 2:
            self.my_lg.info('下面是阿里1688的关键字采集...')
            goods_id_list = self._get_1688_goods_keywords_goods_id_list(keyword=keyword)
        elif type == 3:
            self.my_lg.info('下面是天猫的关键字采集...')
            goods_id_list = self._get_tmall_goods_keywords_goods_id_list(keyword=keyword)
        elif type == 4:
            self.my_lg.info('下面是京东的关键字采集...')
            goods_id_list = self._get_jd_goods_keywords_goods_id_list(keyword=keyword)
        else:
            goods_id_list = []

        return goods_id_list

    def _deal_with_goods_id_list(self, **kwargs):
        """Dispatch a goods_id_list to the per-site spider.

        :param kwargs: type (site type), goods_id_list, keyword_id
        :return: None
        """
        type = kwargs.get('type', '')
        goods_id_list = kwargs.get('goods_id_list', [])
        keyword_id = kwargs.get('keyword_id', '')

        if type == 1:
            self._taobao_keywords_spider(goods_id_list=goods_id_list, keyword_id=keyword_id)
        elif type == 2:
            self._1688_keywords_spider(goods_id_list=goods_id_list, keyword_id=keyword_id)
        elif type == 3:
            self._tmall_keywords_spider(goods_id_list=goods_id_list, keyword_id=keyword_id)
        elif type == 4:
            self._jd_keywords_spider(goods_id_list=goods_id_list, keyword_id=keyword_id)
        else:
            pass

        return None

    def _get_taobao_goods_keywords_goods_id_list(self, keyword) -> list:
        """Query taobao search (sorted by sales desc) for one keyword.

        :param keyword: (keyword_id, keyword)
        :return: list of goods ids (may be empty on any failure)
        """
        headers = {
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'user-agent': get_random_pc_ua(),
            'accept': '*/*',
            # 'referer': 'https://s.taobao.com/search?q=...&search_type=item&sourceId=tb.index&ie=utf8',  # sample referer (trimmed)
            'authority': 's.taobao.com',
            # 'cookie': '<stale captured session cookie removed>',
        }
        # Query params for taobao keyword search ranked by sales volume.
        params = (
            ('data-key', 'sort'),
            ('data-value', 'sale-desc'),
            ('ajax', 'true'),
            # ('_ksTS', '1528171408340_395'),
            ('callback', 'jsonp396'),
            ('q', keyword[1]),
            ('imgfile', ''),
            ('commend', 'all'),
            ('ssid', 's5-e'),
            ('search_type', 'item'),
            ('sourceId', 'tb.index'),
            # ('spm', 'a21bo.2017.201856-taobao-item.1'),
            ('ie', 'utf8'),
            # ('initiative_id', 'tbindexz_20170306'),
        )
        s_url = 'https://s.taobao.com/search'
        body = MyRequests.get_url_body(url=s_url, headers=headers, params=params)
        if body == '':
            return []
        else:
            # Strip the jsonp wrapper: take what is inside jsonp396( ... ).
            try:
                data = re.compile('\((.*)\)').findall(body)[0]
            except IndexError:
                self.my_lg.error('re获取淘宝data时出错, 出错关键字为{0}'.format(keyword[1]))
                return []

            data = json_2_dict(json_str=data, logger=self.my_lg)
            if data == {}:
                self.my_lg.error('获取到的淘宝搜索data为空dict! 出错关键字为{0}'.format(keyword[1]))
                return []
            else:
                goods_id_list = data.get('mainInfo', {}).get('traceInfo', {}).get('traceData', {}).get('allNids', [])
                if goods_id_list is None or goods_id_list == []:
                    self.my_lg.error('获取淘宝搜索goods_id_list为空list! 出错关键字{0}'.format(keyword[1]))
                    return []
                else:
                    return goods_id_list

    def _get_1688_goods_keywords_goods_id_list(self, keyword) -> list:
        """Query m.1688.com search (first page, sales desc) for one keyword.

        :param keyword: (keyword_id, keyword)
        :return: a list eg: ['11111', ...] (may be empty on any failure)
        """
        '''方案1: 从m.1688.com搜索页面进行抓取, 只取第一页的销量排名靠前的商品'''
        headers = {
            'authority': 'm.1688.com',
            'cache-control': 'max-age=0',
            'upgrade-insecure-requests': '1',
            'user-agent': get_random_pc_ua(),
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            # 'cookie': '<stale captured session cookie removed>',
        }
        params = (
            ('sortType', 'booked'),
            ('filtId', ''),
            ('keywords', keyword[1]),
            ('descendOrder', 'true'),
        )
        url = 'https://m.1688.com/offer_search/-6161.html'
        body = MyRequests.get_url_body(url=url, headers=headers, params=params)
        # self.my_lg.info(str(body))
        if body == '':
            return []
        else:
            # Offer ids are carried in a data attribute of each result item.
            try:
                goods_id_list = Selector(text=body).css('div.list_group-item::attr("data-offer-id")').extract()
                # pprint(goods_id_list)
            except Exception as e:
                self.my_lg.exception(e)
                self.my_lg.error('获取1688搜索goods_id_list为空list! 出错关键字{0}'.format(keyword[1]))
                goods_id_list = []

            return goods_id_list

    def _get_tmall_goods_keywords_goods_id_list(self, keyword) -> list:
        """Query tmall m-site search for one keyword.

        :param keyword: (keyword_id, keyword)
        :return: list eg: ['//detail.tmall.com/item.htm?id=566978017832&skuId=3606684772412', ...]
                 i.e. item URLs, NOT goods ids (may be empty on any failure)
        """
        '''方案: tmall m站的搜索'''
        # The m-site search endpoint is occasionally unstable but usable.
        headers = {
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'user-agent': get_random_pc_ua(),
            'accept': '*/*',
            # 'referer': 'https://list.tmall.com/search_product.htm?q=...&type=p&from=mallfp..m_1_suggest&sort=d',  # sample referer (trimmed)
            'authority': 'list.tmall.com',
            # 'cookie': '<stale captured session cookie removed>',
        }
        params = {
            'page_size': '20',
            'page_no': '1',
            'q': str(keyword[1]),
            'type': 'p',
            'spm': 'a220m.6910245.a2227oh.d100',
            'from': 'mallfp..m_1_suggest',
            'sort': 'd',
        }
        s_url = 'https://list.tmall.com/m/search_items.htm'
        body = MyRequests.get_url_body(url=s_url, headers=headers, params=params)
        # self.my_lg.info(str(body))
        if body == '':
            return []
        else:
            data = json_2_dict(json_str=body, logger=self.my_lg)
            if data == {}:
                self.my_lg.error('获取到的天猫搜索data为空dict! 出错关键字为{0}'.format(keyword[1]))
                return []
            else:
                _ = data.get('item', [])
                if _ is None or _ == []:
                    self.my_lg.error('获取天猫搜索goods_id_list为空list! 出错关键字{0}'.format(keyword[1]))
                    return []
                try:
                    goods_id_list = [str(item.get('url', '')) for item in _]
                except Exception as e:
                    self.my_lg.exception(e)
                    self.my_lg.error('获取天猫搜索goods_id_list为空list! 出错关键字{0}'.format(keyword[1]))
                    return []

                return goods_id_list

    def _get_jd_goods_keywords_goods_id_list(self, keyword) -> list:
        """Query jd m-site search api for one keyword, skipping pin-gou items.

        :param keyword: (keyword_id, keyword)
        :return: [] or ['xxxx', ....] of ware ids
        """
        # Approach 1: jd m-site search (json api).
        headers = {
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'user-agent': get_random_pc_ua(),
            'accept': '*/*',
            # 'referer': 'https://so.m.jd.com/ware/search.action?keyword=b&area_ids=1,72,2819&sort_type=sort_totalsales15_desc&qp_disable=no',  # sample referer (trimmed)
            'authority': 'so.m.jd.com',
            # 'cookie': '<stale captured session cookie removed>',
        }
        params = (
            ('keyword', keyword[1]),
            ('datatype', '1'),
            ('callback', 'jdSearchResultBkCbA'),
            ('page', '1'),
            ('pagesize', '10'),
            ('ext_attr', 'no'),
            ('brand_col', 'no'),
            ('price_col', 'no'),
            ('color_col', 'no'),
            ('size_col', 'no'),
            ('ext_attr_sort', 'no'),
            ('merge_sku', 'yes'),
            ('multi_suppliers', 'yes'),
            ('area_ids', '1,72,2819'),
            ('sort_type', 'sort_totalsales15_desc'),
            ('qp_disable', 'no'),
            ('fdesc', '\u5317\u4EAC'),
            # ('t1', '1529934992189'),
        )
        s_url = 'https://so.m.jd.com/ware/search._m2wq_list'
        body = MyRequests.get_url_body(url=s_url, headers=headers, params=params)
        # self.my_lg.info(str(body))
        if body == '':
            return []
        else:
            # Strip the jsonp wrapper jdSearchResultBkCbA( ... ).
            try:
                data = re.compile('jdSearchResultBkCbA\((.*)\)').findall(body)[0]
            except IndexError:
                self.my_lg.error('获取jd的关键字数据时, IndexError! 出错关键字为{0}'.format((keyword[1])))
                return []

            '''问题在于编码中是\xa0之类的,当遇到有些 不用转义的\http之类的,则会出现以上错误。'''
            # Repair invalid escape sequences before json parsing (see note above).
            data = deal_with_JSONDecodeError_about_value_invalid_escape(json_str=data)
            data = json_2_dict(json_str=data, logger=self.my_lg)
            if data == {}:
                # NOTE(review): log message says 天猫 (tmall) but this is the jd
                # branch — message kept as-is, likely a copy-paste slip.
                self.my_lg.error('获取到的天猫搜索data为空dict! 出错关键字为{0}'.format(keyword[1]))
                return []
            else:
                # Skip jd pin-gou (group-buy) items.
                # pprint(data)
                data = data.get('data', {}).get('searchm', {}).get('Paragraph', [])
                # An item with a non-empty 'bp' under 'pinGou' is a group-buy
                # item ('pingou_price'); only keep items where it is empty.
                if data is not None and data != []:
                    goods_id_list = [
                        item.get('wareid', '') for item in data
                        if item.get('pinGou', {}).get('bp', '') == ''
                    ]

                    return goods_id_list
                else:
                    self.my_lg.error('获取到的data为空list, 请检查!')
                    return []

    def _taobao_keywords_spider(self, **kwargs) -> bool:
        """Crawl and store every taobao goods in goods_id_list, then link each
        stored/known goods_id to keyword_id in the middle table.

        :param kwargs: goods_id_list, keyword_id
        :return: True when the whole list has been processed
        """
        goods_id_list = kwargs.get('goods_id_list')
        keyword_id = kwargs.get('keyword_id')
        goods_url_list = ['https://item.taobao.com/item.htm?id=' + item for item in goods_id_list]

        self.my_lg.info('即将开始抓取该关键字的goods, 请耐心等待...')

        for item in goods_url_list:  # item is a goods_url
            result = False  # becomes True iff this goods was inserted or already in db
            try:
                goods_id = re.compile(r'id=(\d+)').findall(item)[0]
            except IndexError:
                self.my_lg.error('re获取goods_id时出错, 请检查!')
                continue

            if goods_id in self.db_existed_goods_id_list:
                self.my_lg.info('该goods_id[{0}]已存在于db中!'.format(goods_id))
                result = True  # already present in the db
                pass
            else:
                taobao = TaoBaoLoginAndParse(logger=self.my_lg)
                # Rebuild the db connection periodically to avoid a stale
                # long-lived connection going unresponsive.
                if self.add_goods_index % 20 == 0:
                    self.my_lg.info('正在重置,并与数据库建立新连接中...')
                    self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                    self.my_lg.info('与数据库的新连接成功建立...')

                if self.my_pipeline.is_connect_success:
                    goods_id = taobao.get_goods_id_from_url(item)
                    if goods_id == '':
                        self.my_lg.error('@@@ 原商品的地址为: {0}'.format(item))
                        continue
                    else:
                        self.my_lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (goods_id, str(self.add_goods_index)))
                        tt = taobao.get_goods_data(goods_id)
                        data = taobao.deal_with_data(goods_id=goods_id)
                        if data != {}:
                            data['goods_id'] = goods_id
                            data['goods_url'] = 'https://item.taobao.com/item.htm?id=' + str(goods_id)
                            data['username'] = '******'
                            data['main_goods_id'] = None
                            # print('------>>>| 爬取到的数据为: ', data)
                            result = taobao.old_taobao_goods_insert_into_new_table(data, pipeline=self.my_pipeline)
                        else:
                            pass
                else:  # db connection unavailable
                    self.my_lg.info('数据库连接失败,数据库可能关闭或者维护中')
                    pass
                self.add_goods_index += 1
                gc.collect()
                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)

            if result:  # only link goods that were inserted or already existed in db
                self._insert_into_goods_id_and_keyword_middle_table(goods_id=goods_id, keyword_id=keyword_id)
            else:
                pass

        self.my_lg.info('该关键字的商品已经抓取完毕!')

        return True

    def _1688_keywords_spider(self, **kwargs) -> bool:
        """Crawl and store every 1688 goods in goods_id_list, then link each
        stored/known goods_id to keyword_id in the middle table.

        :param kwargs: goods_id_list, keyword_id
        :return: True when the whole list has been processed
        """
        goods_id_list = kwargs.get('goods_id_list')
        keyword_id = kwargs.get('keyword_id')
        goods_url_list = ['https://detail.1688.com/offer/{0}.html'.format(item) for item in goods_id_list]

        self.my_lg.info('即将开始抓取该关键字的goods, 请耐心等待...')

        for item in goods_url_list:
            result = False  # reset per goods; True iff inserted or already in db
            try:
                goods_id = re.compile('offer/(.*?).html').findall(item)[0]
            except IndexError:
                self.my_lg.error('re获取goods_id时出错, 请检查!')
                continue

            if goods_id in self.db_existed_goods_id_list:
                self.my_lg.info('该goods_id[{0}]已存在于db中!'.format(goods_id))
                result = True  # already present in the db
                pass
            else:
                ali_1688 = ALi1688LoginAndParse(logger=self.my_lg)
                # Rebuild the db connection periodically to avoid a stale
                # long-lived connection going unresponsive.
                if self.add_goods_index % 20 == 0:
                    self.my_lg.info('正在重置,并与数据库建立新连接中...')
                    self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                    self.my_lg.info('与数据库的新连接成功建立...')

                if self.my_pipeline.is_connect_success:
                    goods_id = ali_1688.get_goods_id_from_url(item)
                    if goods_id == '':
                        self.my_lg.error('@@@ 原商品的地址为: {0}'.format(item))
                        continue
                    else:
                        self.my_lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (goods_id, str(self.add_goods_index)))
                        tt = ali_1688.get_ali_1688_data(goods_id)
                        # Delisted goods that were never stored before: skip.
                        if tt.get('is_delete') == 1 and tt.get('before') is False:
                            continue

                        data = ali_1688.deal_with_data()
                        if data != {}:
                            data['goods_id'] = goods_id
                            data['goods_url'] = 'https://detail.1688.com/offer/' + goods_id + '.html'
                            data['username'] = '******'
                            data['main_goods_id'] = None
                            result = ali_1688.old_ali_1688_goods_insert_into_new_table(data=data, pipeline=self.my_pipeline)
                        else:
                            pass
                else:  # db connection unavailable
                    self.my_lg.info('数据库连接失败,数据库可能关闭或者维护中')
                    pass
                self.add_goods_index += 1
                try:
                    del ali_1688
                except:
                    pass
                gc.collect()
                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)

            if result:  # only link goods that were inserted or already existed in db
                self._insert_into_goods_id_and_keyword_middle_table(goods_id=goods_id, keyword_id=keyword_id)
            else:
                pass

        self.my_lg.info('该关键字的商品已经抓取完毕!')

        return True

    def _tmall_keywords_spider(self, **kwargs) -> bool:
        """Crawl and store every tmall goods in goods_id_list (here: item
        urls), then link each stored/known goods_id to keyword_id.

        :param kwargs: goods_id_list (item urls, see
                       _get_tmall_goods_keywords_goods_id_list), keyword_id
        :return: True when the whole list has been processed
        """
        goods_id_list = kwargs.get('goods_id_list')
        keyword_id = kwargs.get('keyword_id')
        # Drop the '&skuId=...' suffix and prefix the scheme to get clean urls.
        goods_url_list = ['https:' + re.compile('&skuId=.*').sub('', item) for item in goods_id_list]

        self.my_lg.info('即将开始抓取该关键字的goods, 请耐心等待...')

        for item in goods_url_list:  # item is a goods_url
            result = False  # becomes True iff this goods was inserted or already in db
            try:
                goods_id = re.compile(r'id=(\d+)').findall(item)[0]
            except IndexError:
                self.my_lg.error('re获取goods_id时出错, 请检查!')
                continue

            if goods_id in self.db_existed_goods_id_list:
                self.my_lg.info('该goods_id[{0}]已存在于db中!'.format(goods_id))
                result = True  # already present in the db
                pass
            else:
                tmall = TmallParse(logger=self.my_lg)
                # Rebuild the db connection every 20 goods to avoid a stale
                # long-lived connection going unresponsive.
                if self.add_goods_index % 20 == 0:
                    self.my_lg.info('正在重置,并与数据库建立新连接中...')
                    self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                    self.my_lg.info('与数据库的新连接成功建立...')

                if self.my_pipeline.is_connect_success:
                    # Here goods_id is a [type, id] pair, not a plain string.
                    goods_id = tmall.get_goods_id_from_url(item)
                    if goods_id == []:
                        self.my_lg.error('@@@ 原商品的地址为: {0}'.format(item))
                        continue
                    else:
                        self.my_lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (goods_id[1], str(self.add_goods_index)))
                        tt = tmall.get_goods_data(goods_id)
                        data = tmall.deal_with_data()
                        goods_id = goods_id[1]
                        if data != {}:
                            data['goods_id'] = goods_id
                            data['username'] = '******'
                            data['main_goods_id'] = None
                            data['goods_url'] = tmall._from_tmall_type_get_tmall_url(type=data['type'], goods_id=goods_id)
                            if data['goods_url'] == '':
                                self.my_lg.error('该goods_url为空值! 此处跳过!')
                                continue

                            result = tmall.old_tmall_goods_insert_into_new_table(data, pipeline=self.my_pipeline)
                        else:
                            pass
                else:
                    self.my_lg.info('数据库连接失败,数据库可能关闭或者维护中')
                    pass
                self.add_goods_index += 1
                gc.collect()
                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)

            if result:  # only link goods that were inserted or already existed in db
                self._insert_into_goods_id_and_keyword_middle_table(goods_id=goods_id, keyword_id=keyword_id)
            else:
                pass

        self.my_lg.info('该关键字的商品已经抓取完毕!')

        return True

    def _jd_keywords_spider(self, **kwargs) -> bool:
        """Crawl and store every jd goods in goods_id_list, then link each
        stored/known goods_id to keyword_id in the middle table.

        :param kwargs: goods_id_list, keyword_id
        :return: True when the whole list has been processed
        """
        goods_id_list = kwargs.get('goods_id_list')
        keyword_id = kwargs.get('keyword_id')

        '''初始地址可以直接用这个[https://item.jd.com/xxxxx.html]因为jd会给你重定向到正确地址, 存也可以存这个地址'''
        # So jd goods are not sub-classified here; everything is stored as a
        # regular goods, site_id = 7.
        goods_url_list = ['https://item.jd.com/{0}.html'.format(str(item)) for item in goods_id_list]

        self.my_lg.info('即将开始抓取该关键字的goods, 请耐心等待...')

        for item in goods_url_list:  # item is a goods_url
            result = False  # becomes True iff this goods was inserted into db or already there
            try:
                goods_id = re.compile('\/(\d+)\.html').findall(item)[0]
            except IndexError:
                self.my_lg.error('re获取goods_id时出错, 请检查!')
                continue

            if goods_id in self.db_existed_goods_id_list:
                self.my_lg.info('该goods_id[{0}]已存在于db中!'.format(goods_id))
                result = True  # already present in the db
                pass
            else:
                jd = JdParse(logger=self.my_lg)
                # Rebuild the db connection every 20 goods to avoid a stale
                # long-lived connection going unresponsive.
                if self.add_goods_index % 20 == 0:
                    self.my_lg.info('正在重置,并与数据库建立新连接中...')
                    self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                    self.my_lg.info('与数据库的新连接成功建立...')

                if self.my_pipeline.is_connect_success:
                    # Here goods_id is a [type, id] pair, not a plain string.
                    goods_id = jd.get_goods_id_from_url(item)
                    if goods_id == []:
                        self.my_lg.error('@@@ 原商品的地址为: {0}'.format(item))
                        continue
                    else:
                        self.my_lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (goods_id[1], str(self.add_goods_index)))
                        tt = jd.get_goods_data(goods_id)
                        data = jd.deal_with_data(goods_id)
                        goods_id = goods_id[1]
                        if data != {}:
                            data['goods_id'] = goods_id
                            data['username'] = '******'
                            data['main_goods_id'] = None
                            data['goods_url'] = item
                            result = jd.old_jd_goods_insert_into_new_table(data, self.my_pipeline)
                        else:
                            pass
                else:
                    self.my_lg.info('数据库连接失败,数据库可能关闭或者维护中')
                    pass
                self.add_goods_index += 1
                sleep(1)
                try:
                    del jd
                except:
                    pass
                gc.collect()

            if result:  # only link goods that were inserted or already existed in db
                self._insert_into_goods_id_and_keyword_middle_table(goods_id=goods_id, keyword_id=keyword_id)
            else:
                pass

        self.my_lg.info('该关键字的商品已经抓取完毕!')

        return True

    def _insert_into_goods_id_and_keyword_middle_table(self, **kwargs) -> bool:
        """Insert one (goods_id, keyword_id) link into the middle table,
        skipping the insert when the link already exists.

        :param kwargs: goods_id, keyword_id
        :return: True on successful insert, False otherwise (including
                 "already linked" and query failure)
        """
        goods_id = str(kwargs['goods_id'])
        keyword_id = int(kwargs['keyword_id'])
        # self.my_lg.info(goods_id)
        # self.my_lg.info(keyword_id)
        result = False

        '''先判断中间表goods_id_and_keyword_middle_table是否已新增该关键字的id'''
        # Note: partial sql fragments use plain '' strings, not r''.
        try:
            _ = self.my_pipeline._select_table(sql_str=kw_select_str_3, params=(goods_id,))
            _ = [i[0] for i in _]
            # pprint(_)
        except Exception:
            self.my_lg.error('执行中间表goods_id_and_keyword_middle_table是否已新增该关键字的id的sql语句时出错, 跳过给商品加keyword_id')
            return result

        if keyword_id not in _:
            params = (
                goods_id,
                keyword_id,
            )
            self.my_lg.info('------>>>| 正在插入keyword_id为{0}, goods_id为{1}'.format(params[1], params[0]))
            result = self.my_pipeline._insert_into_table_2(sql_str=self.add_keyword_id_for_goods_id_sql_str, params=params, logger=self.my_lg)

        return result

    def _add_keyword_2_db_from_excel_file(self) -> bool:
        """Import new keywords into the db from a local excel file,
        skipping keywords that already exist.

        :return: True when the import finished, False when the excel file
                 could not be read
        """
        # NOTE(review): hard-coded developer-machine path — parameterize
        # before reuse.
        excel_file_path = '/Users/afa/Desktop/2018-07-18-淘宝phone-top20万.xlsx'
        self.my_lg.info('正在读取{0}, 请耐心等待...'.format(excel_file_path))
        try:
            excel_result = read_info_from_excel_file(excel_file_path=excel_file_path)
        except Exception:
            self.my_lg.error('遇到错误:', exc_info=True)
            return False
        self.my_lg.info('读取完毕!!')

        self.my_lg.info('正在读取db中原先的keyword...')
        db_keywords = self.my_pipeline._select_table(sql_str=kw_select_str_4)
        db_keywords = [i[0] for i in db_keywords]
        self.my_lg.info('db keywords 读取完毕!')

        for item in excel_result:
            keyword = item.get('关键词', None)
            if not keyword:
                continue
            if keyword in db_keywords:
                self.my_lg.info('该关键字{0}已经存在于db中...'.format(keyword))
                continue
            self.my_lg.info('------>>>| 正在存储关键字 {0}'.format(keyword))
            self.my_pipeline._insert_into_table_2(sql_str=kw_insert_str_2, params=(str(keyword), 0), logger=self.my_lg)

        self.my_lg.info('全部写入完毕!')

        return True

    def __del__(self):
        # Best-effort cleanup; attributes may already be gone at interpreter
        # shutdown, hence the broad excepts.
        try:
            del self.my_lg
            del self.msg
            del self.my_pipeline
        except:
            pass
        try:
            del self.db_existed_goods_id_list
        except:
            pass
        gc.collect()
class RecommendGoodOps(AsyncCrawler):
    """荐好ops.

    Operations bot that periodically collects candidate article ids from four
    feeds (zq / hk / lfd / gxg), filters out already-published ones against a
    SQL Server dedup table, and publishes the remainder through the yx admin
    backend driven by Selenium.
    """

    def __init__(self):
        AsyncCrawler.__init__(
            self,
            log_print=True,
            is_new_loop=False,
            log_save_path=MY_SPIDER_LOGS_PATH + '/荐好/ops/',
            ip_pool_type=IP_POOL_TYPE,)
        self.request_num_retries = 6
        self.article_type = 'zq'
        # NOTE(fix): the two credential lines were corrupted by secret
        # scrubbing ('******') and left the file unparseable; restored to
        # plain interactive input.
        self.yx_username = input('请输入yx_username:')
        self.yx_password = input('请输入yx_password:')
        # SECURITY NOTE(review): credentials are logged in plaintext below;
        # consider removing this log line.
        self.lg.info('yx_username: {}, yx_password: {}'.format(
            self.yx_username, self.yx_password))
        self.publish_url = 'https://configadmin.yiuxiu.com/Business/Index'
        self.select_sql0 = 'SELECT unique_id FROM dbo.recommend_good_ops_article_id_duplicate_removal'
        self.insert_sql0 = 'INSERT INTO dbo.recommend_good_ops_article_id_duplicate_removal(unique_id, create_time) values(%s, %s)'
        # Latest known article-id range; 0 means "not fetched yet".
        self.min_article_id = 0
        self.max_article_id = 0
        self.driver_headless = True
        # 必须使用代理, yx限制ip频繁
        self.driver_use_proxy = True
        # 荐好管理label
        self.recommend_good_label_css_selector = 'span.nav-label'
        # How many article ids to sample from each feed per round.
        self.zq_intercept_num = 2
        self.hk_intercept_num = 1
        self.lfd_intercept_num = 1
        self.gxg_intercept_num = 1
        self.article_parser = None
        # Per-feed caches: {'data': [...], 'cache_time': unix_ts}.
        self.hk_cache_dict = {}
        self.lfd_cache_dict = {}
        self.gxg_cache_dict = {}

    async def _fck_run(self):
        """Main loop: publish a batch, then sleep ~5.5 minutes."""
        # 休眠5分钟, 避免频繁发!
        sleep_time = 60 * 5.5
        self.db_article_id_list = await self.get_db_unique_id_list()
        assert self.db_article_id_list != []
        self.lg.info('db_article_id_list_len: {}'.format(
            len(self.db_article_id_list)))
        while True:
            if get_shanghai_time().hour == 0:
                # 夜晚休眠
                await async_sleep(60 * 60 * 4.)
            try:
                try:
                    await async_wait_for(
                        self.auto_publish_articles(),
                        timeout=(self.zq_intercept_num + self.hk_intercept_num) * 2.5 * 60)
                except AsyncTimeoutError:
                    # Treat a whole-batch timeout as a publish failure.
                    raise PublishOneArticleFailException
            except (ArticleTitleOverLongException,
                    LoginFailException,
                    ArticleTitleContainSensitiveWordsException,
                    PublishOneArticleFailException,
                    EnterTargetPageFailException,):
                # Known, recoverable failures: log and retry immediately.
                self.lg.error('遇到错误:', exc_info=True)
                continue
            except Exception:
                self.lg.error('遇到错误:', exc_info=True)
            self.lg.info('休眠{}s...'.format(sleep_time))
            await async_sleep(sleep_time)

    async def get_db_unique_id_list(self) -> list:
        """Fetch the already-published unique_id list from the dedup table."""
        self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        if not self.sql_cli.is_connect_success:
            raise SqlServerConnectionException
        try:
            res = self.sql_cli._select_table(
                sql_str=self.select_sql0,
                logger=self.lg,)
        except Exception:
            self.lg.error('遇到错误:', exc_info=True)
            res = []
        res = [] if res is None else res
        return [item[0] for item in res]

    async def auto_publish_articles(self):
        """Collect one batch of candidate articles and publish them."""
        self.sql_cli = get_new_sql_cli(sql_cli=self.sql_cli)
        if not self.sql_cli.is_connect_success:
            raise SqlServerConnectionException
        if self.min_article_id == 0 \
                or self.max_article_id == 0:
            # First run: derive the current zq article-id range.
            self.article_parser = ArticleParser(logger=self.lg)
            article_list = self.loop.run_until_complete(
                self.article_parser.get_article_list_by_article_type(
                    article_type=self.article_type,))
            assert article_list != []
            self.min_article_id, self.max_article_id = \
                self.get_latest_max_and_min_artcile_id_from_article_list(
                    article_list=article_list,)
            self.lg.info('最新的min_article_id: {}, max_article_id: {}'.format(
                self.min_article_id,
                self.max_article_id,))
        zq_article_list = self.get_zq_own_create_article_id_list(
            min_article_id=self.min_article_id,
            max_article_id=self.max_article_id,)
        hk_article_list = self.get_hk_article_id_list()
        lfd_article_list = self.get_lfd_article_id_list()
        gxg_article_list = self.get_gxg_article_id_list()
        # 文章在前的发布顺序, 视频在后(避免视频发过多)
        article_list = zq_article_list + hk_article_list + lfd_article_list + gxg_article_list
        assert article_list != []
        target_article_list = self.get_target_article_list(
            article_list=article_list)
        if target_article_list == []:
            self.lg.info('待发布的target_article_list为空list, pass!')
            return
        # rasp上代理模式启动chromedriver具有一定的失败率, 故还是mac
        driver = BaseDriver(
            type=CHROME,
            executable_path=CHROME_DRIVER_PATH,
            load_images=True,
            logger=self.lg,
            headless=self.driver_headless,
            driver_use_proxy=self.driver_use_proxy,
            ip_pool_type=self.ip_pool_type,)
        try:
            try:
                self.login_bg(driver=driver)
                self.get_into_recommend_good_manage(driver=driver)
            except FZTimeoutError:
                raise LoginFailException
            for item in target_article_list:
                uid = item.get('uid', '')
                title = item.get('title', '')
                article_url = item.get('article_url', '')
                self.lg.info('正在发布文章 title: {}, article_url: {} ...'.format(
                    title, article_url))
                try:
                    self.publish_one_article(
                        driver=driver,
                        article_url=article_url,)
                except FZTimeoutError:
                    raise PublishOneArticleFailException
                # Mark as published: in-memory list and dedup table.
                self.db_article_id_list.append(uid)
                self.sql_cli._insert_into_table_2(
                    sql_str=self.insert_sql0,
                    params=(
                        uid,
                        get_shanghai_time(),
                    ),
                    logger=self.lg,)
        except (ArticleTitleOverLongException,
                LoginFailException,
                ArticleTitleContainSensitiveWordsException,
                PublishOneArticleFailException,
                EnterTargetPageFailException,) as e:
            # Known failures propagate to the caller's retry logic.
            raise e
        except Exception:
            self.lg.error('遇到错误:', exc_info=True)
        finally:
            try:
                del driver
            except:
                pass
        return

    def _get_cached_article_list(self, article_type: str, cache_dict: dict,
                                 refresh_interval, intercept_num: int) -> list:
        """Shared cache logic for the hk/lfd/gxg feeds.

        Refetches the feed when `cache_dict` is empty or older than
        `refresh_interval` seconds, then returns a random sample of
        `intercept_num` items.
        """
        if not isinstance(self.article_parser, ArticleParser):
            self.article_parser = ArticleParser(logger=self.lg)
        need_refresh = cache_dict == {} \
            or datetime_to_timestamp(get_shanghai_time()) - cache_dict['cache_time'] > refresh_interval
        if need_refresh:
            article_list = self.loop.run_until_complete(
                self.article_parser.get_article_list_by_article_type(
                    article_type=article_type,))
            cache_dict['data'] = article_list
            cache_dict['cache_time'] = datetime_to_timestamp(
                get_shanghai_time())
        else:
            article_list = cache_dict['data']
        return random_sample(article_list, intercept_num)

    def get_gxg_article_id_list(self):
        """获取gxg目标article_id_list."""
        # gxg 每日更新数量有限, 每过30分钟重新获取一次; 截取1个(与图文穿插)
        return self._get_cached_article_list(
            article_type='gxg',
            cache_dict=self.gxg_cache_dict,
            refresh_interval=30 * 60,
            intercept_num=self.gxg_intercept_num,)

    def get_lfd_article_id_list(self):
        """获取lfd目标article_id_list."""
        # lfd 每日更新数量有限, 每过30分钟重新获取一次; 截取1个(与图文穿插)
        return self._get_cached_article_list(
            article_type='lfd',
            cache_dict=self.lfd_cache_dict,
            refresh_interval=30 * 60,
            intercept_num=self.lfd_intercept_num,)

    def get_hk_article_id_list(self):
        """获取hk 目标article_id_list."""
        # 每过12分钟重新获取一次; 截取1个(与图文穿插)
        return self._get_cached_article_list(
            article_type='hk',
            cache_dict=self.hk_cache_dict,
            refresh_interval=12 * 60,
            intercept_num=self.hk_intercept_num,)

    def get_latest_max_and_min_artcile_id_from_article_list(
            self, article_list) -> tuple:
        """Return (min, max) of the plausible article ids in `article_list`.

        Ids shorter than 8 digits are ignored (zq ids are e.g. '17296475').
        """
        latest_article_id_list = []
        for item in article_list:
            article_id = item.get('article_id', '')
            if len(article_id) >= 8:
                latest_article_id_list.append(int(article_id))
        assert latest_article_id_list != []
        latest_article_id_list = sorted(latest_article_id_list)
        return (latest_article_id_list[0], latest_article_id_list[-1])

    def get_zq_own_create_article_id_list(self, min_article_id: int,
                                          max_article_id: int):
        """Synthesize zq article items from the upper half of the id range."""
        # 取中间值, 避免老是在发老新闻
        middle_article_id = int((min_article_id + max_article_id) / 2)
        self.lg.info('middle_article_id: {}'.format(middle_article_id))
        article_id_list = [
            str(article_id)
            for article_id in range(middle_article_id, max_article_id)
        ]
        article_id_list = random_sample(article_id_list,
                                        self.zq_intercept_num)
        res = [{
            'uid': get_uuid3(target_str='{}::{}'.format('zq', article_id)),
            'article_type': 'zq',
            'title': '未知',
            'article_id': article_id,
            'article_url': 'https://focus.youth.cn/mobile/detail/id/{}#'.format(article_id),
        } for article_id in article_id_list]
        # 本地不检测了 (local title pre-validation was removed).
        return res

    def get_target_article_list(self, article_list: list) -> list:
        """Filter out items already recorded in db_article_id_list."""
        target_article_list = []
        for item in article_list:
            try:
                title = item.get('title', '')
                assert title != ''
                uid = item.get('uid', '')
                assert uid != ''
                article_url = item.get('article_url', '')
                assert article_url != ''
                if uid not in self.db_article_id_list:
                    target_article_list.append(item)
                else:
                    self.lg.info('该文章之前已被发布![where title: {}, url: {}]'.format(
                        title, article_url))
                    continue
            except Exception:
                self.lg.error('遇到错误:', exc_info=True)
                continue
        return target_article_list

    @fz_set_timeout(seconds=1.5 * 60)
    def login_bg(self, driver: BaseDriver):
        """Log into the admin backend; raise on any login-page failure."""
        self.lg.info('login ...')
        body = driver.get_url_body(
            url=self.publish_url,
            timeout=30,)
        try:
            assert body != ''
            driver.find_element(value='input#loginName').send_keys(
                self.yx_username)
            driver.find_element(value='input#loginPwd').send_keys(
                self.yx_password)
            driver.find_element(value='button#subbut').click()
        except (NoSuchElementException,
                SeleniumTimeoutException,
                AssertionError,
                WebDriverException,
                AttributeError,):
            # 抛出登录异常
            raise LoginFailException
        try:
            self.wait_for_recommend_good_label_appear(driver=driver)
        except FZTimeoutError:
            # 进入目标页失败, 则抛出异常!
            raise EnterTargetPageFailException

    @fz_set_timeout(seconds=10.)
    def wait_for_recommend_good_label_appear(self, driver: BaseDriver):
        """Poll until the 荐好管理 label appears (login success marker)."""
        while True:
            recommend_good_label_text = driver.find_element(
                value=self.recommend_good_label_css_selector).text
            if recommend_good_label_text == '荐好管理':
                break
        self.lg.info('login success!')

    @fz_set_timeout(seconds=60.)
    def get_into_recommend_good_manage(self, driver: BaseDriver):
        """Navigate to the 荐好管理 page."""
        try:
            driver.find_element(
                value=self.recommend_good_label_css_selector).click()
            # 等待下方标签出现
            sleep(.5)
            driver.find_element(value='a.J_menuItem').click()
        except SeleniumTimeoutException:
            # 进入目标页失败, 则抛出异常!
            raise EnterTargetPageFailException

    @fz_set_timeout(seconds=2. * 60)
    def publish_one_article(self, driver: BaseDriver, article_url: str):
        """Publish a single article via the snatch-url form in the iframe."""
        try:
            # 切换到目标iframe(用index有时候不准, pass)
            iframe_ele_list = driver.find_elements(by=By.TAG_NAME,
                                                   value='iframe')
            assert iframe_ele_list != []
            target_iframe_ele = iframe_ele_list[1] if len(
                iframe_ele_list) > 1 else iframe_ele_list[0]
            driver.switch_to_frame(frame_reference=target_iframe_ele)
        except (NoSuchFrameException, ) as e:
            # 没匹配到frame(可能是原先就在目标iframe)
            raise e
        # 清空输入框, 输入待采集地址
        input_box_ele = driver.find_element(value='input#SnatchUrl')
        input_box_ele.clear()
        input_box_ele.send_keys(article_url)
        # 点击采集按钮
        driver.find_elements(value='span.input-group-btn button')[0].click()
        try:
            self.wait_for_delete_img_appear(driver=driver)
        except (FZTimeoutError, NoSuchElementException):
            # 发布某文章超时失败or无元素存在, 则抛出发布异常
            raise PublishOneArticleFailException
        title = driver.find_element(
            value='input#RecommendName').get_attribute('value')
        self.lg.info('title: {}'.format(title))
        if target_str_contain_some_char_check(
                target_str=title,
                check_char_obj=ARTICLE_TITLE_SENSITIVE_STR_TUPLE):
            raise ArticleTitleContainSensitiveWordsException
        if isinstance(title, str) and len(title) > 30:
            self.lg.info('@@@ title 标题过长, 无法发布!! 跳过!')
            # 由于标题过长后, 无法处理后续文章, 故不return, 直接抛出异常
            raise ArticleTitleOverLongException
        try:
            # 点击发布按钮
            driver.find_elements(
                value='span.input-group-btn button')[1].click()
        except WebDriverException:
            # 处理发布单篇异常!
            raise PublishOneArticleFailException
        # 切换至主页面
        driver.switch_to_default_content()
        # 填写被发布人
        random_phone = self.get_random_phone()
        driver.find_element(
            value='input.layui-layer-input').send_keys(random_phone)
        # 点击确定
        driver.find_element(value='a.layui-layer-btn0').click()
        self.lg.info('url: {} 发布成功!'.format(article_url))
        # 发布成功, 等待5.秒, 等待页面元素置空
        sleep(5.)
        return

    @fz_set_timeout(seconds=60.)
    def wait_for_delete_img_appear(self, driver: BaseDriver):
        """Poll until the 删除 button shows (image scraped; avoids no-image posts)."""
        while True:
            delete_btn_text = driver.find_element(value='div.deletebut').text
            if delete_btn_text == '删除':
                break
        self.lg.info('该url采集完毕!')

    def get_random_phone(self) -> int:
        """Pick a random phone number from ../tools/phone.txt."""
        phone_list = []
        with open('../tools/phone.txt', 'r') as f:
            for line in f:
                try:
                    phone_list.append(int(line.replace('\n', '')))
                except Exception:
                    continue
        random_phone = phone_list[randint(0, len(phone_list) - 1)]
        self.lg.info('random_phone: {}'.format(random_phone))
        return random_phone

    def __del__(self):
        try:
            del self.lg
            del self.loop
            del self.db_article_id_list
            del self.publish_url
            del self.article_parser
            del self.hk_cache_dict
        except:
            pass
        collect()
class XiaoHongShuParse(Crawler):
    """Xiaohongshu article crawler.

    Scrapes articles either from the PC web page or the WX mini-program API
    (by_wx=True), washes sensitive wording, and stores the results into
    dbo.daren_recommend.
    """

    def __init__(self, logger=None, by_wx=False):
        '''
        :param logger:
        :param by_wx: 抓取wx小程序(弊端: 没有tags值 优点: 可长期采集, 不容易被封) √
            vs 抓取app(弊端: 测试发现就算用高匿proxy每跑20个, 就被封3-5分钟, 效率低)
        '''
        super(XiaoHongShuParse, self).__init__(
            ip_pool_type=IP_POOL_TYPE,
            log_print=True,
            logger=logger,
            log_save_path=MY_SPIDER_LOGS_PATH + '/小红书/_/',)
        self._set_headers()
        self.by_wx = by_wx
        self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
        self.index = 0
        self.success_insert_db_num = 0
        # 抓每天文章的sleep_time(wx=1/app=2)
        self.CRAWL_ARTICLE_SLEEP_TIME = 1
        # 每抓10条休眠时间
        self.LONG_SLEEP_TIME = 0
        # share_ids already present in db (loaded lazily per batch).
        self.db_share_id = []
        self.ip_pool_type = IP_POOL_TYPE

    def _set_headers(self):
        """Default PC-browser headers for page requests."""
        self.headers = {
            'authority': 'www.xiaohongshu.com',
            'cache-control': 'max-age=0',
            'upgrade-insecure-requests': '1',
            'user-agent': get_random_pc_ua(),
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
        }

    def _get_xiaohongshu_home_aritles_info(self):
        '''
        小红书主页json模拟获取(模拟app端主页请求)
        :return: list of {'share_link', 'likes'}
        '''
        headers = {
            'Accept-Encoding': 'br, gzip, deflate',
            'Connection': 'keep-alive',
            'Accept': 'application/json',
            'Host': 'www.xiaohongshu.com',
            'User-Agent': 'discover/5.19.1 (iPhone; iOS 11.0; Scale/3.00) Resolution/1242*2208 Version/5.19.1 Build/5191001 Device/(Apple Inc.;iPhone7,1)',
            'Accept-Language': 'zh-Hans-CN;q=1, en-CN;q=0.9',
            'X-Tingyun-Id': 'LbxHzUNcfig;c=2;r=551911068',
        }
        # 下面参数每个都是必须的, 且不变
        params = (
            ('deviceId', '2AEEF650-2CAE-480F-B30C-CA5CABC26193'),
            ('device_fingerprint', '201805101352429dd715d37f422fe3e64dd3923c0b0bc8017d90c099539039'),
            ('device_fingerprint1', '201805101352429dd715d37f422fe3e64dd3923c0b0bc8017d90c099539039'),
            ('lang', 'zh'),
            ('num', '10'),
            ('oid', 'homefeed_recommend'),
            ('platform', 'iOS'),
            ('sid', 'session.1210427606534613282'),
            ('sign', 'c9a9eadc6c46823ae3075d7b28fe97fa'),
            # 用原来的避免sign错误
            ('t', '1531010946'),
        )
        url = 'https://www.xiaohongshu.com/api/sns/v6/homefeed'
        body = Requests.get_url_body(url=url,
                                     headers=headers,
                                     params=params,
                                     cookies=None,
                                     high_conceal=True,
                                     ip_pool_type=self.ip_pool_type)
        if body == '':
            self.lg.error('获取到的body为空值!请检查!')
            return []
        if re.compile(r'<title>403 Forbidden</title>').findall(body) != []:
            self.lg.info('此次抓取被403禁止!')
            sleep(self.CRAWL_ARTICLE_SLEEP_TIME)
            return []
        _ = json_2_dict(body, logger=self.lg).get('data', [])
        if _ == []:
            self.lg.error('获取到的data为空值!请检查!')
            return []
        return [{
            'share_link': item.get('share_link', ''),
            'likes': item.get('likes', 0),
        } for item in _]

    def _deal_with_home_article(self):
        """One crawl cycle: fetch home feed, parse, store, then long-sleep."""
        home_articles_link_list = self._get_xiaohongshu_home_aritles_info()
        self.lg.info(home_articles_link_list)
        data = self._deal_with_articles(articles_list=home_articles_link_list)
        self._save_articles(data=data)
        self.lg.info('一次采集完毕, 进入{0}s休眠...'.format(self.LONG_SLEEP_TIME))
        # 设置休眠, 实现周期抓取, 避免频繁抓取被封禁(测试发现抓20个就会封一会)
        sleep(self.LONG_SLEEP_TIME)
        return True

    def _deal_with_articles(self, articles_list):
        '''
        处理给与小红书地址(articles_list)
        :param articles_list: 待抓取的文章地址list
            eg: [{'share_link':'小红书地址', 'likes': 111}, ...] # likes可以为空
        :return: data a list
        '''
        data = []
        _db = self.my_pipeline._select_table(
            sql_str='select share_id from dbo.daren_recommend')
        if _db is not None and _db != [] and _db != [()]:
            self.db_share_id = [item[0] for item in _db]
        for item in articles_list:
            self.index += 1
            article_link = item.get('share_link', '')
            article_likes = item.get('likes', 0)
            # FIX: guard the extraction -- an empty/odd share_link previously
            # raised IndexError here, before the article_link != '' check.
            _id_match = re.compile(r'/item/(\w+)').findall(article_link)
            if not _id_match:
                self.lg.error('无法从share_link中提取article_id, 跳过! link: {0}'.format(article_link))
                continue
            article_id = _id_match[0]
            if article_id in self.db_share_id:
                self.lg.info('该{0}已存在于db中...跳过!'.format(article_id))
                # FIX: actually skip the duplicate (the log said 跳过 but the
                # original fell through and re-scraped/re-inserted it).
                continue
            self.lg.info('[+] {0}'.format(article_link))
            if article_link != '':
                if not self.by_wx:
                    # 通过pc端
                    params = (('_at', '499a292d16aed3d80a068fc60e0c1e3ee3410'), )
                    body = Requests.get_url_body(
                        url=article_link,
                        headers=self.headers,
                        params=params,
                        high_conceal=True,
                        ip_pool_type=self.ip_pool_type)
                    try:
                        article_info = re.compile(
                            r'window.__INITIAL_SSR_STATE__=(.*?)</script>'
                        ).findall(body)[0]
                    except IndexError:
                        self.lg.error('获取article_info时IndexError!请检查!')
                        sleep(self.CRAWL_ARTICLE_SLEEP_TIME)
                        continue
                    article_info = self._wash_article_info(
                        json_2_dict(json_str=article_info, logger=self.lg))
                    article_info = self._parse_page(
                        article_link=article_link,
                        article_info=article_info,
                        article_likes=article_likes)
                else:
                    # 通过wx小程序
                    # wx接口改版, 需要一个参数Auth认证, 暂时没处理
                    url = 'https://www.xiaohongshu.com/sapi/wx_mp_api/sns/v1/note/' + article_id
                    params = {
                        # 对方服务器用来判断登录是否过期(过期则替换这个即可再次采集)
                        "sid": "session.1210427606534613282",
                    }
                    body = Requests.get_url_body(
                        url=url,
                        headers=self.headers,
                        params=params,
                        ip_pool_type=self.ip_pool_type)
                    if body == '':
                        self.lg.error('获取到的article的body为空值!跳过!')
                        sleep(self.CRAWL_ARTICLE_SLEEP_TIME)
                        continue
                    article_info = self._wash_article_info_from_wx(
                        json_2_dict(json_str=body, logger=self.lg))
                    article_info = self._parse_page_from_wx(
                        article_link=article_link,
                        article_info=article_info,
                        article_likes=article_likes)
                data.append(article_info)
                sleep(self.CRAWL_ARTICLE_SLEEP_TIME)
        self.lg.info('@@@ 抓取完毕!')
        return data

    def _parse_page(self, **kwargs):
        '''
        解析单个article的info (PC page variant)
        :return: a WellRecommendArticle (dict-like), or {} on failure
        '''
        article_link = kwargs.get('article_link', '')
        article_info = kwargs.get('article_info', {}).get('NoteView', {})
        article_likes = kwargs.get('article_likes', get_random_int_number())
        error_msg = '出错article_url: {0}'.format(article_link)
        try:
            nick_name = article_info.get('noteInfo', {}).get('user', {}).get('nickname', '')
            assert nick_name != '', '获取到的nick_name为空值!请检查!' + error_msg
            head_url = article_info.get('noteInfo', {}).get('user', {}).get('image', '')
            assert head_url != '', '获取到的head_url为空值!请检查!' + error_msg
            # 个人简介或者个性签名(留空)
            profile = ''
            share_id = article_info.get('noteInfo', {}).get('id', '')
            assert share_id != '', '获取到的share_id为空值!请检查!' + error_msg
            # title默认留空
            title = article_info.get('noteInfo', {}).get('title', '')
            comment_content = self.wash_sensitive_info(
                article_info.get('noteInfo', {}).get('desc', ''))
            assert comment_content != '', '获取到的comment_content为空!请检查!' + error_msg
            # 如果是视频的话, 则里面第一张图片就是视频第一帧
            share_img_url_list = [{
                'img_url': item.get('original', ''),
                'height': item.get('height'),
                'width': item.get('width'),
            } for item in article_info.get('noteInfo', {}).get('images', [])]
            assert share_img_url_list != [], '获取到的share_img_url_list为空list!请检查!' + error_msg
            div_body = ''
            gather_url = article_link
            # 原文章原始的创建日期
            tmp_create_time = article_info.get('noteInfo', {}).get('time', '')
            assert tmp_create_time != '', '获取到的create_time为空值!请检查!'
            create_time = string_to_datetime(tmp_create_time + ':00')
            site_id = 3  # 小红书
            goods_url_list = []
            share_goods_base_info = []
            tags = self._get_tags(article_info=article_info)
            # 视频播放地址
            tmp_video_url = article_info.get('noteInfo', {}).get('video', '')
            tmp_video_url = 'https:' + tmp_video_url if tmp_video_url != '' else ''
            video_url = re.compile(r'//sa.').sub(r'//v.', tmp_video_url)
            likes = article_likes
            collects = article_info.get('noteInfo', {}).get('collects', None)
            assert collects is not None, '获取到的collects为None!请检查!' + error_msg
        except Exception:
            sleep(self.CRAWL_ARTICLE_SLEEP_TIME)
            self.lg.error('遇到错误: ', exc_info=True)
            return {}
        _ = WellRecommendArticle()
        _['nick_name'] = nick_name
        _['head_url'] = head_url
        _['profile'] = profile
        _['share_id'] = share_id
        _['title'] = title
        _['comment_content'] = comment_content
        _['share_img_url_list'] = share_img_url_list
        _['div_body'] = div_body
        _['gather_url'] = gather_url
        _['create_time'] = create_time
        _['site_id'] = site_id
        _['goods_url_list'] = goods_url_list
        _['tags'] = tags
        _['share_goods_base_info'] = share_goods_base_info
        _['video_url'] = video_url
        _['likes'] = likes
        _['collects'] = collects
        return _

    def _parse_page_from_wx(self, **kwargs):
        '''
        解析wx单个article的info (WX mini-program variant)
        :return: a WellRecommendArticle object, or {} on failure
        '''
        article_link = kwargs.get('article_link', '')
        article_info = kwargs.get('article_info', {}).get('data', {})
        article_likes = kwargs.get('article_likes', get_random_int_number())
        error_msg = '出错article_url: {0}'.format(article_link)
        try:
            nick_name = article_info.get('user', {}).get('nickname', '')
            assert nick_name != '', '获取到的nick_name为空值!请检查!' + error_msg
            head_url = article_info.get('user', {}).get('images', '')
            assert head_url != '', '获取到的head_url为空值!请检查!' + error_msg
            profile = ''
            share_id = article_info.get('id', '')
            assert share_id != '', '获取到的share_id为空值!请检查!' + error_msg
            # title默认留空
            title = self.wash_sensitive_info(article_info.get('title', ''))
            comment_content = self.wash_sensitive_info(
                article_info.get('desc', ''))
            assert comment_content != '', '获取到的comment_content为空!请检查!' + error_msg
            # 如果是视频的话, 则里面第一张图片就是视频第一帧
            share_img_url_list = [{
                'img_url': item.get('original', ''),
                'height': item.get('height'),
                'width': item.get('width'),
            } for item in article_info.get('images_list', [])]
            assert share_img_url_list != [], '获取到的share_img_url_list为空list!请检查!' + error_msg
            div_body = ''
            gather_url = article_link
            # 原文章原始的创建日期
            tmp_create_time = article_info.get('time', '')
            assert tmp_create_time != '', '获取到的create_time为空值!请检查!'
            create_time = string_to_datetime(tmp_create_time + ':00')
            site_id = 3  # 小红书
            goods_url_list = []
            share_goods_base_info = []
            # wx端tags没有返回值
            tags = self._get_tags_from_wx(article_info=article_info)
            # 视频播放地址
            tmp_video_url = article_info.get('video', '')
            tmp_video_url = re.compile(r'\?.*').sub('', tmp_video_url)
            video_url = re.compile(r'//sa.').sub(r'//v.', tmp_video_url)
            likes = article_likes
            collects = article_info.get('fav_count', None)
            assert collects is not None, '获取到的collects为None!请检查!' + error_msg
        except Exception:
            sleep(self.CRAWL_ARTICLE_SLEEP_TIME)
            self.lg.error('遇到错误:', exc_info=True)
            return {}
        _ = WellRecommendArticle()
        _['nick_name'] = nick_name
        _['head_url'] = head_url
        _['profile'] = profile
        _['share_id'] = share_id
        _['title'] = title
        _['comment_content'] = comment_content
        _['share_img_url_list'] = share_img_url_list
        _['div_body'] = div_body
        _['gather_url'] = gather_url
        _['create_time'] = create_time
        _['site_id'] = site_id
        _['goods_url_list'] = goods_url_list
        _['tags'] = tags
        _['share_goods_base_info'] = share_goods_base_info
        _['video_url'] = video_url
        _['likes'] = likes
        _['collects'] = collects
        return _

    def _save_articles(self, data):
        '''
        存储数据
        :param data: list of parsed article dicts
        :return: True
        '''
        self.lg.info('即将开始存储该文章...')
        sql_str = 'insert into dbo.daren_recommend(share_id, nick_name, head_url, profile, gather_url, title, comment_content, share_img_url_list, div_body, create_time, site_id, tags, video_url, likes, collects) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
        for item in data:
            # Recycle the db connection every 20 processed articles.
            if self.index % 20 == 0:
                self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
            if self.my_pipeline.is_connect_success:
                share_id = item.get('share_id', '')
                if share_id == '':
                    continue
                self.lg.info(
                    '------>>>| 正在存储share_id: {0}...'.format(share_id))
                try:
                    params = self._get_db_insert_into_params(item=item)
                except Exception:
                    continue
                result = self.my_pipeline._insert_into_table_2(
                    sql_str=sql_str, params=params, logger=self.lg)
                if result:
                    self.success_insert_db_num += 1
            else:
                self.lg.error('db连接失败!存储失败! 出错article地址:{0}'.format(
                    item.get('gather_url', '')))
        self.lg.info('@' * 9 +
                     ' 目前成功存储{0}个!'.format(self.success_insert_db_num))
        return True

    def _get_db_insert_into_params(self, item):
        '''
        得到待存储的数据 (tuple ordered to match the insert sql)
        '''
        params = [
            item['share_id'],
            item['nick_name'],
            item['head_url'],
            item['profile'],
            item['gather_url'],
            item['title'],
            item['comment_content'],
            dumps(item['share_img_url_list'], ensure_ascii=False),
            item['div_body'],
            item['create_time'],
            item['site_id'],
            dumps(item['tags'], ensure_ascii=False),
            item['video_url'],
            item['likes'],
            item['collects'],
        ]
        return tuple(params)

    def _get_tags(self, article_info):
        '''
        获取tags (may be an empty list)
        '''
        tmp_tags = list_duplicate_remove([
            str(item.get('name', '')) for item in article_info.get(
                'noteInfo', {}).get('relatedTags', [])
        ])
        # list先转str, 去掉敏感字眼, 再转list, 并去除''元素, 得到最后list
        tmp_tags = delete_list_null_str(
            self.wash_sensitive_info('|'.join(tmp_tags)).split('|'))
        return [{
            'keyword': item,
        } for item in tmp_tags]

    def _get_tags_from_wx(self, article_info):
        '''
        从wx获取tags (wx接口无tags, 恒为空)
        '''
        return []

    def _wash_article_info(self, _dict):
        '''
        清洗无用字段 (PC variant)
        '''
        try:
            _dict['NoteView']['commentInfo'] = {}   # 评论信息
            _dict['NoteView']['panelData'] = []     # 相关笔记
        except:
            pass
        return _dict

    def _wash_article_info_from_wx(self, _dict):
        '''
        清洗wx无用字段
        '''
        try:
            _dict['data']['mini_program_info'] = {}  # 推荐首页的缩略信息
            _dict['data']['share_info'] = {}         # 分享的信息
        except:
            pass
        return _dict

    def wash_sensitive_info(self, data):
        '''
        清洗敏感信息 (brand words, captain nicknames, hashtags/@-mentions)
        '''
        replace_str_list = [
            ('小红书', '优秀网'),
            ('xiaohongshu', '优秀网'),
            ('XIAOHONGSHU', '优秀网'),
            ('某宝', '优秀网'),
            ('薯队长', '秀队长'),
            ('薯宝宝', '秀客'),
            ('红薯们', '秀客们'),
            ('小红薯', '小秀客'),
        ]
        add_sensitive_str_list = [
            '#.*#',
            '@.*?薯',
        ]
        # Delegates to the module-level wash_sensitive_info helper.
        return wash_sensitive_info(
            data=data,
            replace_str_list=replace_str_list,
            add_sensitive_str_list=add_sensitive_str_list)

    def __del__(self):
        try:
            del self.lg
            del self.my_pipeline
        except:
            pass
        gc.collect()
class XiaoHongShuParse(object):
    '''
    Crawler/parser for xiaohongshu.com ("Little Red Book") articles.

    Two crawl channels:
      * by_wx=True : wx mini-program API. Drawback: no tags available.
        Advantage: can crawl long-term without being banned.
      * by_wx=False: app/pc page. Has tags, but (per original author's tests)
        even with high-anonymity proxies the site bans for 3-5 minutes after
        every ~20 requests, so throughput is low.
    '''
    def __init__(self, logger=None, by_wx=False):
        '''
        :param logger: external logger; when None a file logger is created
        :param by_wx: crawl via the wx mini-program API instead of the pc page
        '''
        super(XiaoHongShuParse, self).__init__()
        self._set_logger(logger)
        self._set_headers()
        self.by_wx = by_wx
        self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
        self.index = 0                      # running count of processed articles (drives db reconnects)
        self.success_insert_db_num = 0      # how many articles were stored successfully
        self.CRAWL_ARTICLE_SLEEP_TIME = 1   # sleep between articles (wx=1 / app=2)
        self.LONG_SLEEP_TIME = 0            # sleep after each batch of ~10
        self.db_share_id = []               # share_ids already present in the db

    def _set_headers(self):
        '''Build the pc-page request headers (randomized desktop UA).'''
        self.headers = {
            'authority': 'www.xiaohongshu.com',
            'cache-control': 'max-age=0',
            'upgrade-insecure-requests': '1',
            'user-agent': get_random_pc_ua(),
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            # NOTE: a captured session 'cookie' used to live here commented out;
            # it was dead weight (and a leaked credential) and has been removed.
        }

    def _set_logger(self, logger):
        '''Use the injected logger, or create a dated file logger under MY_SPIDER_LOGS_PATH.'''
        if logger is None:
            self.my_lg = set_logger(
                log_file_name=MY_SPIDER_LOGS_PATH + '/小红书/_/' + str(get_shanghai_time())[0:10] + '.txt',
                console_log_level=INFO,
                file_log_level=ERROR
            )
        else:
            self.my_lg = logger

    def _get_xiaohongshu_home_aritles_info(self):
        '''
        Fetch the home-feed article list by emulating the app's homefeed request.

        :return: list of {'share_link': str, 'likes': int}; [] on any failure
        '''
        headers = {
            'Accept-Encoding': 'br, gzip, deflate',
            'Connection': 'keep-alive',
            # 'device_id': '2AEEF650-2CAE-480F-B30C-CA5CABC26193',
            'Accept': 'application/json',
            'Host': 'www.xiaohongshu.com',
            'User-Agent': 'discover/5.19.1 (iPhone; iOS 11.0; Scale/3.00) Resolution/1242*2208 Version/5.19.1 Build/5191001 Device/(Apple Inc.;iPhone7,1)',
            # 'Authorization': 'session.1210427606534613282',
            'Accept-Language': 'zh-Hans-CN;q=1, en-CN;q=0.9',
            'X-Tingyun-Id': 'LbxHzUNcfig;c=2;r=551911068',
        }
        # Every parameter below is required by the endpoint and must stay fixed;
        # in particular 't' must match the captured 'sign' or the request is rejected.
        params = (
            ('deviceId', '2AEEF650-2CAE-480F-B30C-CA5CABC26193'),
            ('device_fingerprint', '201805101352429dd715d37f422fe3e64dd3923c0b0bc8017d90c099539039'),
            ('device_fingerprint1', '201805101352429dd715d37f422fe3e64dd3923c0b0bc8017d90c099539039'),
            ('lang', 'zh'),
            ('num', '10'),
            ('oid', 'homefeed_recommend'),
            ('platform', 'iOS'),
            ('sid', 'session.1210427606534613282'),
            ('sign', 'c9a9eadc6c46823ae3075d7b28fe97fa'),
            ('t', '1531010946'),    # keep the captured timestamp to avoid a sign mismatch
        )
        url = 'https://www.xiaohongshu.com/api/sns/v6/homefeed'
        body = MyRequests.get_url_body(url=url, headers=headers, params=params, cookies=None, high_conceal=True)
        if body == '':
            self.my_lg.error('获取到的body为空值!请检查!')
            return []
        if re.compile(r'<title>403 Forbidden</title>').findall(body) != []:
            self.my_lg.info('此次抓取被403禁止!')
            sleep(self.CRAWL_ARTICLE_SLEEP_TIME)
            return []
        _ = json_2_dict(body, logger=self.my_lg).get('data', [])
        if _ == []:
            self.my_lg.error('获取到的data为空值!请检查!')
            return []
        _ = [{
            'share_link': item.get('share_link', ''),
            'likes': item.get('likes', 0),
        } for item in _]
        return _

    def _deal_with_home_article(self):
        '''One full cycle: fetch home feed, parse every article, persist, then sleep.'''
        home_articles_link_list = self._get_xiaohongshu_home_aritles_info()
        self.my_lg.info(home_articles_link_list)
        data = self._deal_with_articles(articles_list=home_articles_link_list)
        self._save_articles(data=data)
        self.my_lg.info('一次采集完毕, 进入{0}s休眠...'.format(self.LONG_SLEEP_TIME))
        # periodic sleep to avoid bans (tests showed ~20 requests trigger one)
        sleep(self.LONG_SLEEP_TIME)
        return True

    def _deal_with_articles(self, articles_list):
        '''
        Crawl and parse every article in articles_list.

        :param articles_list: eg [{'share_link': '...', 'likes': 111}, ...] (likes optional)
        :return: list of parsed article dicts (failed articles are skipped)
        '''
        data = []
        _db = self.my_pipeline._select_table(sql_str='select share_id from dbo.daren_recommend')
        if _db is not None and _db != [] and _db != [()]:
            self.db_share_id = [item[0] for item in _db]
        for item in articles_list:
            self.index += 1
            article_link = item.get('share_link', '')
            article_likes = item.get('likes', 0)
            # BUGFIX: the original did findall(...)[0] unguarded, so an empty or
            # malformed share_link raised IndexError and aborted the whole batch.
            _match = re.compile(r'/item/(\w+)').findall(article_link)
            if not _match:
                self.my_lg.error('无法从article_link中提取article_id, 跳过! 出错地址: {0}'.format(article_link))
                continue
            article_id = _match[0]
            if article_id in self.db_share_id:
                self.my_lg.info('该{0}已存在于db中...跳过!'.format(article_id))
                # BUGFIX: the log always said "skip" but the loop fell through
                # and re-crawled/re-saved the article anyway.
                continue
            self.my_lg.info('[+] {0}'.format(article_link))
            if article_link != '':
                if not self.by_wx:
                    # pc-page channel: scrape the SSR state blob out of the html
                    body = MyRequests.get_url_body(url=article_link, headers=self.headers, high_conceal=True)
                    try:
                        article_info = re.compile('window.__INITIAL_SSR_STATE__=(.*?)</script>').findall(body)[0]
                    except IndexError:
                        self.my_lg.error('获取article_info时IndexError!请检查!')
                        sleep(self.CRAWL_ARTICLE_SLEEP_TIME)
                        continue
                    article_info = self._wash_article_info(json_2_dict(json_str=article_info, logger=self.my_lg))
                    article_info = self._parse_page(
                        article_link=article_link,
                        article_info=article_info,
                        article_likes=article_likes)
                else:
                    # wx mini-program channel: plain JSON endpoint
                    url = "https://www.xiaohongshu.com/wx_mp_api/sns/v1/note/" + article_id
                    params = {
                        # the server uses sid to decide whether the login expired
                        # (when it expires, replace this value to resume crawling)
                        "sid": "session.1210427606534613282",
                    }
                    body = MyRequests.get_url_body(url=url, headers=self.headers, params=params)
                    if body == '':
                        self.my_lg.error('获取到的article的body为空值!跳过!')
                        sleep(self.CRAWL_ARTICLE_SLEEP_TIME)
                        continue
                    article_info = self._wash_article_info_from_wx(json_2_dict(json_str=body, logger=self.my_lg))
                    article_info = self._parse_page_from_wx(
                        article_link=article_link,
                        article_info=article_info,
                        article_likes=article_likes)
                data.append(article_info)
                sleep(self.CRAWL_ARTICLE_SLEEP_TIME)
            else:
                pass
        self.my_lg.info('@@@ 抓取完毕!')
        return data

    def _parse_page(self, **kwargs):
        '''
        Parse a single article from the pc-page SSR state.

        :return: a WellRecommendArticle (dict-like); {} when any field is missing
        '''
        article_link = kwargs.get('article_link', '')
        article_info = kwargs.get('article_info', {}).get('NoteView', {})
        article_likes = kwargs.get('article_likes', get_random_int_number())
        error_msg = '出错article_url: {0}'.format(article_link)
        # The asserts below are deliberate control flow: any missing field raises
        # AssertionError, is caught by the except, and the article is dropped.
        try:
            nick_name = article_info.get('noteInfo', {}).get('user', {}).get('nickname', '')
            assert nick_name != '', '获取到的nick_name为空值!请检查!' + error_msg
            head_url = article_info.get('noteInfo', {}).get('user', {}).get('image', '')
            assert head_url != '', '获取到的head_url为空值!请检查!' + error_msg
            profile = ''    # personal bio/signature (left empty)
            share_id = article_info.get('noteInfo', {}).get('id', '')
            assert share_id != '', '获取到的share_id为空值!请检查!' + error_msg
            title = article_info.get('noteInfo', {}).get('title', '')   # title may be empty
            comment_content = self.wash_sensitive_info(article_info.get('noteInfo', {}).get('desc', ''))
            assert comment_content != '', '获取到的comment_content为空!请检查!' + error_msg
            # for videos the first image is the video's first frame
            share_img_url_list = [{
                'img_url': item.get('original', ''),
                'height': item.get('height'),   # image height/width
                'width': item.get('width'),
            } for item in article_info.get('noteInfo', {}).get('images', [])]
            assert share_img_url_list != [], '获取到的share_img_url_list为空list!请检查!' + error_msg
            div_body = ''       # left empty by design
            gather_url = article_link
            # the article's original creation date
            tmp_create_time = article_info.get('noteInfo', {}).get('time', '')
            assert tmp_create_time != '', '获取到的create_time为空值!请检查!'
            create_time = string_to_datetime(tmp_create_time + ':00')
            site_id = 3     # 3 == xiaohongshu
            goods_url_list = []         # goods urls still to crawl from this article
            share_goods_base_info = []
            tags = self._get_tags(article_info=article_info)
            # video playback url; rewrite the sa. CDN host to v.
            tmp_video_url = article_info.get('noteInfo', {}).get('video', '')
            tmp_video_url = 'https:' + tmp_video_url if tmp_video_url != '' else ''
            video_url = re.compile(r'//sa.').sub(r'//v.', tmp_video_url)
            likes = article_likes
            collects = article_info.get('noteInfo', {}).get('collects', None)
            assert collects is not None, '获取到的collects为None!请检查!' + error_msg
        except Exception:
            sleep(self.CRAWL_ARTICLE_SLEEP_TIME)
            self.my_lg.error('遇到错误: ', exc_info=True)
            return {}
        _ = WellRecommendArticle()
        _['nick_name'] = nick_name
        _['head_url'] = head_url
        _['profile'] = profile
        _['share_id'] = share_id
        _['title'] = title
        _['comment_content'] = comment_content
        _['share_img_url_list'] = share_img_url_list
        _['div_body'] = div_body
        _['gather_url'] = gather_url
        _['create_time'] = create_time
        _['site_id'] = site_id
        _['goods_url_list'] = goods_url_list
        _['tags'] = tags
        _['share_goods_base_info'] = share_goods_base_info
        _['video_url'] = video_url
        _['likes'] = likes
        _['collects'] = collects
        return _

    def _parse_page_from_wx(self, **kwargs):
        '''
        Parse a single article from the wx mini-program JSON.

        :return: a WellRecommendArticle (dict-like); {} when any field is missing
        '''
        article_link = kwargs.get('article_link', '')
        article_info = kwargs.get('article_info', {}).get('data', {})
        article_likes = kwargs.get('article_likes', get_random_int_number())
        error_msg = '出错article_url: {0}'.format(article_link)
        # asserts are deliberate control flow (see _parse_page)
        try:
            nick_name = article_info.get('user', {}).get('nickname', '')
            assert nick_name != '', '获取到的nick_name为空值!请检查!' + error_msg
            head_url = article_info.get('user', {}).get('images', '')
            assert head_url != '', '获取到的head_url为空值!请检查!' + error_msg
            profile = ''    # personal bio/signature (left empty)
            share_id = article_info.get('id', '')
            assert share_id != '', '获取到的share_id为空值!请检查!' + error_msg
            title = self.wash_sensitive_info(article_info.get('title', ''))     # title may be empty
            comment_content = self.wash_sensitive_info(article_info.get('desc', ''))
            assert comment_content != '', '获取到的comment_content为空!请检查!' + error_msg
            # for videos the first image is the video's first frame
            share_img_url_list = [{
                'img_url': item.get('original', ''),
                'height': item.get('height'),   # image height/width
                'width': item.get('width'),
            } for item in article_info.get('images_list', [])]
            assert share_img_url_list != [], '获取到的share_img_url_list为空list!请检查!' + error_msg
            div_body = ''       # left empty by design
            gather_url = article_link
            # the article's original creation date
            tmp_create_time = article_info.get('time', '')
            assert tmp_create_time != '', '获取到的create_time为空值!请检查!'
            create_time = string_to_datetime(tmp_create_time + ':00')
            site_id = 3     # 3 == xiaohongshu
            goods_url_list = []
            share_goods_base_info = []
            # the wx endpoint returns no tags
            tags = self._get_tags_from_wx(article_info=article_info)
            # video playback url: strip the query string, rewrite sa. host to v.
            tmp_video_url = article_info.get('video', '')
            tmp_video_url = re.compile('\?.*').sub('', tmp_video_url)
            video_url = re.compile(r'//sa.').sub(r'//v.', tmp_video_url)
            likes = article_likes
            collects = article_info.get('fav_count', None)
            assert collects is not None, '获取到的collects为None!请检查!' + error_msg
        except Exception:
            sleep(self.CRAWL_ARTICLE_SLEEP_TIME)
            self.my_lg.error('遇到错误:', exc_info=True)
            return {}
        _ = WellRecommendArticle()
        _['nick_name'] = nick_name
        _['head_url'] = head_url
        _['profile'] = profile
        _['share_id'] = share_id
        _['title'] = title
        _['comment_content'] = comment_content
        _['share_img_url_list'] = share_img_url_list
        _['div_body'] = div_body
        _['gather_url'] = gather_url
        _['create_time'] = create_time
        _['site_id'] = site_id
        _['goods_url_list'] = goods_url_list
        _['tags'] = tags
        _['share_goods_base_info'] = share_goods_base_info
        _['video_url'] = video_url
        _['likes'] = likes
        _['collects'] = collects
        return _

    def _save_articles(self, data):
        '''
        Persist parsed articles into dbo.daren_recommend.

        :param data: list of parsed article dicts
        :return: True (per-article failures are logged, not raised)
        '''
        self.my_lg.info('即将开始存储该文章...')
        sql_str = 'insert into dbo.daren_recommend(share_id, nick_name, head_url, profile, gather_url, title, comment_content, share_img_url_list, div_body, create_time, site_id, tags, video_url, likes, collects) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
        for item in data:
            # reconnect every 20 processed articles to avoid stale connections
            if self.index % 20 == 0:
                self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
            if self.my_pipeline.is_connect_success:
                share_id = item.get('share_id', '')
                if share_id == '':
                    continue
                self.my_lg.info('------>>>| 正在存储share_id: {0}...'.format(share_id))
                try:
                    params = self._get_db_insert_into_params(item=item)
                except Exception:
                    # malformed item (missing key): skip it, keep the batch going
                    continue
                result = self.my_pipeline._insert_into_table_2(sql_str=sql_str, params=params, logger=self.my_lg)
                if result:
                    self.success_insert_db_num += 1
            else:
                self.my_lg.error('db连接失败!存储失败! 出错article地址:{0}'.format(item.get('gather_url', '')))
        self.my_lg.info('@' * 9 + ' 目前成功存储{0}个!'.format(self.success_insert_db_num))
        return True

    def _get_db_insert_into_params(self, item):
        '''
        Build the parameter tuple matching the insert sql in _save_articles.

        :param item: a parsed article dict
        :return: tuple of column values (raises KeyError on missing fields)
        '''
        params = [
            item['share_id'],
            item['nick_name'],
            item['head_url'],
            item['profile'],
            item['gather_url'],
            item['title'],
            item['comment_content'],
            dumps(item['share_img_url_list'], ensure_ascii=False),
            # dumps(item['goods_id_list'], ensure_ascii=False),
            # dumps(item['share_goods_base_info'], ensure_ascii=False),
            item['div_body'],
            item['create_time'],
            item['site_id'],
            dumps(item['tags'], ensure_ascii=False),
            item['video_url'],
            item['likes'],
            item['collects'],
        ]
        return tuple(params)

    def _get_tags(self, article_info):
        '''
        Extract tags from the article's relatedTags (pc-page channel only).

        :return: list of {'keyword': str}; may legitimately be []
        '''
        tmp_tags = list_duplicate_remove(
            [str(item.get('name', '')) for item in article_info.get('noteInfo', {}).get('relatedTags', [])])
        # join to one string so the sensitive-word wash runs once, then split
        # back to a list and drop any '' elements produced by the wash
        tmp_tags = delete_list_null_str(self.wash_sensitive_info('|'.join(tmp_tags)).split('|'))
        tags = [{
            'keyword': item,
        } for item in tmp_tags]
        return tags

    def _get_tags_from_wx(self, article_info):
        '''wx endpoint exposes no tags, so always return [].'''
        return []

    def _wash_article_info(self, _dict):
        '''Blank out unused pc-page fields (comments, related notes); tolerate missing keys.'''
        try:
            _dict['NoteView']['commentInfo'] = {}   # comment info
            _dict['NoteView']['panelData'] = []     # related notes
        except:
            pass
        return _dict

    def _wash_article_info_from_wx(self, _dict):
        '''Blank out unused wx fields (home-feed thumbnail, share info); tolerate missing keys.'''
        try:
            _dict['data']['mini_program_info'] = {}     # home-feed thumbnail info
            _dict['data']['share_info'] = {}            # share info
        except:
            pass
        return _dict

    def wash_sensitive_info(self, data):
        '''
        Replace brand-sensitive words and strip sensitive patterns from text.

        :param data: raw text
        :return: washed text
        '''
        replace_str_list = [
            ('小红书', '优秀网'),
            ('xiaohongshu', '优秀网'),
            ('XIAOHONGSHU', '优秀网'),
            ('某宝', '优秀网'),
            ('薯队长', '秀队长'),
            ('薯宝宝', '秀客'),
            ('红薯们', '秀客们'),
            ('小红薯', '小秀客'),
        ]
        add_sensitive_str_list = [
            '#.*#',
            '@.*?薯',
        ]
        # delegates to the module-level wash_sensitive_info helper
        return wash_sensitive_info(
            data=data,
            replace_str_list=replace_str_list,
            add_sensitive_str_list=add_sensitive_str_list)

    def __del__(self):
        try:
            del self.my_lg
            del self.my_pipeline
        except:
            pass
        gc.collect()
class GoodsKeywordsSpider(object):
    '''
    Keyword-driven goods crawler: for every keyword in dbo.goods_keywords,
    search the enabled e-commerce sites (taobao / 1688 / tmall), crawl the
    goods found, and record the goods_id <-> keyword_id relation in
    dbo.goods_id_and_keyword_middle_table.
    '''
    def __init__(self):
        self._set_logger()
        self.msg = ''
        # BUGFIX: the original called self._init_debugging_api() twice and
        # discarded the first result; one call is enough.
        self.debugging_api = self._init_debugging_api()
        self._set_func_name_dict()
        self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
        # sql for inserting into goods_id_and_keyword_middle_table
        self.add_keyword_id_for_goods_id_sql_str = r'insert into dbo.goods_id_and_keyword_middle_table(goods_id, keyword_id) VALUES (%s, %s)'

    def _set_logger(self):
        '''Create a dated file logger under MY_SPIDER_LOGS_PATH/goods_keywords.'''
        self.my_lg = set_logger(
            log_file_name=MY_SPIDER_LOGS_PATH + '/goods_keywords/_/' + str(get_shanghai_time())[0:10] + '.txt',
            console_log_level=INFO,
            file_log_level=ERROR
        )

    def _init_debugging_api(self):
        '''
        Per-site crawl switches (site_id -> enabled).
        1=taobao, 2=ali1688, 3=tmall, 4=jd.
        :return: dict
        '''
        return {
            1: False,
            2: False,
            3: True,
            4: False,
        }

    def _set_func_name_dict(self):
        # NOTE(review): template strings for eval-style dispatch; currently
        # unused — actual dispatch happens in _deal_with_goods_id_list.
        self.func_name_dict = {
            'taobao': 'self._taobao_keywords_spider(goods_id_list={0}, keyword_id={1})',
            'ali': 'self._ali_keywords_spider(goods_id_list={0}, keyword_id={1})',
            'tmall': 'self._tmall_keywords_spider(goods_id_list={0}, keyword_id={1})',
            'jd': 'self._jd_keywords_spider(goods_id_list={0}, keyword_id={1})'
        }

    def _just_run(self):
        '''Main loop: fetch keywords + existing goods_ids, then crawl each enabled site.'''
        while True:
            # all active keywords
            sql_str = r'select id, keyword from dbo.goods_keywords where is_delete=0'
            # all goods_ids already present in the goods db
            sql_str_2 = r'select GoodsID from dbo.GoodsInfoAutoGet'
            try:
                result = list(self.my_pipeline._select_table(sql_str=sql_str))
                self.my_lg.info('正在获取db中已存在的goods_id...')
                result_2 = list(self.my_pipeline._select_table(sql_str=sql_str_2))
                self.my_lg.info('db中已存在的goods_id获取成功!')
            except TypeError:
                # _select_table returns None on connection failure -> list(None)
                self.my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
                result = None
                result_2 = None
            if result is not None and result_2 is not None:
                self.my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
                self.my_lg.info(str(result))
                self.my_lg.info('--------------------------------------------------------')
                self.my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
                self.add_goods_index = 0    # counts goods added (drives db reconnects)
                self.db_existed_goods_id_list = [item[0] for item in result_2]
                # release the big result set immediately
                try:
                    del result_2
                except:
                    pass
                gc.collect()
                for type, type_value in self.debugging_api.items():  # iterate enabled sites
                    if type_value is False:
                        self.my_lg.info('api为False, 跳过!')
                        continue
                    for item in result:     # iterate keywords, item == (id, keyword)
                        self.my_lg.info('正在处理id为{0}, 关键字为 {1} ...'.format(item[0], item[1]))
                        # reconnect every 20 goods to avoid stale connections
                        if self.add_goods_index % 20 == 0:
                            self.my_lg.info('my_pipeline客户端重连中...')
                            try:
                                del self.my_pipeline
                            except:
                                pass
                            self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                            self.my_lg.info('my_pipeline客户端重连完毕!')
                        goods_id_list = self._get_keywords_goods_id_list(type=type, keyword=item)
                        self.my_lg.info('关键字为{0}, 获取到的goods_id_list 如下: {1}'.format(item[1], str(goods_id_list)))
                        '''处理goods_id_list'''
                        self._deal_with_goods_id_list(
                            type=type,
                            goods_id_list=goods_id_list,
                            keyword_id=item[0]
                        )
            # NOTE(review): indentation of this sleep is ambiguous in the
            # original; placed at cycle level so a db outage cannot hot-loop.
            sleep(5)

    def _get_keywords_goods_id_list(self, type, keyword):
        '''
        Search one site for a keyword.

        :param type: site id (1=taobao, 2=ali1688, 3=tmall)
        :param keyword: (id, keyword) tuple
        :return: list of goods ids (or goods urls for tmall)
        '''
        if type == 1:
            self.my_lg.info('下面是淘宝的关键字采集...')
            goods_id_list = self._get_taobao_goods_keywords_goods_id_list(keyword=keyword)
        elif type == 2:
            self.my_lg.info('下面是阿里1688的关键字采集...')
            goods_id_list = self._get_1688_goods_keywords_goods_id_list(keyword=keyword)
        elif type == 3:
            self.my_lg.info('下面是天猫的关键字采集...')
            goods_id_list = self._get_tmall_goods_keywords_goods_id_list(keyword=keyword)
        else:
            goods_id_list = []
        return goods_id_list

    def _deal_with_goods_id_list(self, **kwargs):
        '''
        Dispatch the crawled id list to the per-site spider.

        :param kwargs: type(int), goods_id_list(list), keyword_id(int)
        :return: None
        '''
        type = kwargs.get('type', '')
        goods_id_list = kwargs.get('goods_id_list', [])
        keyword_id = kwargs.get('keyword_id', '')
        if type == 1:
            self._taobao_keywords_spider(goods_id_list=goods_id_list, keyword_id=keyword_id)
        elif type == 2:
            self._1688_keywords_spider(goods_id_list=goods_id_list, keyword_id=keyword_id)
        elif type == 3:
            self._tmall_keywords_spider(goods_id_list=goods_id_list, keyword_id=keyword_id)
        elif type == 4:
            pass    # jd spider not implemented yet
        else:
            pass
        return None

    def _get_taobao_goods_keywords_goods_id_list(self, keyword):
        '''
        Taobao keyword search, results ordered by sales volume.

        :param keyword: (id, keyword)
        :return: list of goods_id strings; [] on any failure
        '''
        headers = {
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'user-agent': HEADERS[randint(0, len(HEADERS)-1)],
            'accept': '*/*',
            'authority': 's.taobao.com',
            # NOTE: captured 'referer'/'cookie' values previously kept here
            # commented out were removed (dead weight / leaked session).
        }
        # sale-desc => sorted by sales, descending
        params = (
            ('data-key', 'sort'),
            ('data-value', 'sale-desc'),
            ('ajax', 'true'),
            # ('_ksTS', '1528171408340_395'),
            ('callback', 'jsonp396'),
            ('q', keyword[1]),
            ('imgfile', ''),
            ('commend', 'all'),
            ('ssid', 's5-e'),
            ('search_type', 'item'),
            ('sourceId', 'tb.index'),
            # ('spm', 'a21bo.2017.201856-taobao-item.1'),
            ('ie', 'utf8'),
            # ('initiative_id', 'tbindexz_20170306'),
        )
        s_url = 'https://s.taobao.com/search'
        body = MyRequests.get_url_body(url=s_url, headers=headers, params=params)
        if body == '':
            return []
        else:
            try:
                # unwrap the jsonp callback: jsonp396(<json>)
                data = re.compile('\((.*)\)').findall(body)[0]
            except IndexError:
                self.my_lg.error('re获取淘宝data时出错, 出错关键字为{0}'.format(keyword[1]))
                return []
            data = self.json_str_2_dict(json_str=data)
            if data == {}:
                self.my_lg.error('获取到的淘宝搜索data为空dict! 出错关键字为{0}'.format(keyword[1]))
                return []
            else:
                goods_id_list = data.get('mainInfo', {}).get('traceInfo', {}).get('traceData', {}).get('allNids', [])
                if goods_id_list is None or goods_id_list == []:
                    self.my_lg.error('获取淘宝搜索goods_id_list为空list! 出错关键字{0}'.format(keyword[1]))
                    return []
                else:
                    return goods_id_list

    def _get_1688_goods_keywords_goods_id_list(self, keyword):
        '''
        1688 keyword search via m.1688.com (first page, ordered by bookings).

        :param keyword: (id, keyword)
        :return: a list eg: ['11111', ...]; [] on any failure
        '''
        headers = {
            'authority': 'm.1688.com',
            'cache-control': 'max-age=0',
            'upgrade-insecure-requests': '1',
            'user-agent': HEADERS[randint(0, len(HEADERS)-1)],
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            # NOTE: captured 'cookie' previously kept here commented out was removed.
        }
        params = (
            ('sortType', 'booked'),
            ('filtId', ''),
            ('keywords', keyword[1]),
            ('descendOrder', 'true'),
        )
        url = 'https://m.1688.com/offer_search/-6161.html'
        body = MyRequests.get_url_body(url=url, headers=headers, params=params)
        if body == '':
            return []
        else:
            try:
                # offer ids are carried as data-offer-id attributes
                goods_id_list = Selector(text=body).css('div.list_group-item::attr("data-offer-id")').extract()
            except Exception as e:
                self.my_lg.exception(e)
                self.my_lg.error('获取1688搜索goods_id_list为空list! 出错关键字{0}'.format(keyword[1]))
                goods_id_list = []
            return goods_id_list

    def _get_tmall_goods_keywords_goods_id_list(self, keyword):
        '''
        Tmall keyword search via the m-site, ordered by sales.

        :param keyword: (id, keyword)
        :return: list of goods URLs (NOT goods_ids),
                 eg: ['//detail.tmall.com/item.htm?id=566978017832&skuId=...', ...]
        '''
        headers = {
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'user-agent': HEADERS[randint(0, len(HEADERS)-1)],
            'accept': '*/*',
            'authority': 'list.tmall.com',
            # NOTE: captured 'referer'/'cookie' values previously kept here
            # commented out were removed.
        }
        params = {
            'page_size': '20',
            'page_no': '1',
            'q': str(keyword[1]),
            'type': 'p',
            'spm': 'a220m.6910245.a2227oh.d100',
            'from': 'mallfp..m_1_suggest',
            'sort': 'd',
        }
        s_url = 'https://list.tmall.com/m/search_items.htm'
        body = MyRequests.get_url_body(url=s_url, headers=headers, params=params)
        if body == '':
            return []
        else:
            data = self.json_str_2_dict(json_str=body)
            if data == {}:
                self.my_lg.error('获取到的天猫搜索data为空dict! 出错关键字为{0}'.format(keyword[1]))
                return []
            else:
                _ = data.get('item', [])
                if _ is None or _ == []:
                    self.my_lg.error('获取天猫搜索goods_id_list为空list! 出错关键字{0}'.format(keyword[1]))
                    return []
                try:
                    goods_id_list = [str(item.get('url', '')) for item in _]
                except Exception as e:
                    self.my_lg.exception(e)
                    self.my_lg.error('获取天猫搜索goods_id_list为空list! 出错关键字{0}'.format(keyword[1]))
                    return []
                return goods_id_list

    def _taobao_keywords_spider(self, **kwargs):
        '''
        Crawl + store every taobao goods in goods_id_list, then link each stored
        (or already-stored) goods to the keyword in the middle table.

        :param kwargs: goods_id_list(list of id str), keyword_id(int)
        :return: True
        '''
        goods_id_list = kwargs.get('goods_id_list')
        keyword_id = kwargs.get('keyword_id')
        goods_url_list = ['https://item.taobao.com/item.htm?id=' + item for item in goods_id_list]
        self.my_lg.info('即将开始抓取该关键字的goods, 请耐心等待...')
        for item in goods_url_list:     # item is a goods url
            # BUGFIX: result was only initialized once before the loop, so a
            # stale True from a previous goods caused the middle-table insert
            # to fire for an item whose own crawl/insert failed.
            result = False      # True <=> this goods was inserted or already existed
            try:
                goods_id = re.compile(r'id=(\d+)').findall(item)[0]
            except IndexError:
                self.my_lg.error('re获取goods_id时出错, 请检查!')
                continue
            if goods_id in self.db_existed_goods_id_list:
                self.my_lg.info('该goods_id[{0}]已存在于db中!'.format(goods_id))
                result = True   # already stored
            else:
                taobao = TaoBaoLoginAndParse(logger=self.my_lg)
                # reconnect every 20 goods to avoid a hung long-lived connection
                if self.add_goods_index % 20 == 0:
                    self.my_lg.info('正在重置,并与数据库建立新连接中...')
                    self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                    self.my_lg.info('与数据库的新连接成功建立...')
                if self.my_pipeline.is_connect_success:
                    goods_id = taobao.get_goods_id_from_url(item)
                    if goods_id == '':
                        self.my_lg.error('@@@ 原商品的地址为: {0}'.format(item))
                        continue
                    else:
                        self.my_lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (goods_id, str(self.add_goods_index)))
                        tt = taobao.get_goods_data(goods_id)
                        data = taobao.deal_with_data(goods_id=goods_id)
                        if data != {}:
                            data['goods_id'] = goods_id
                            data['goods_url'] = 'https://item.taobao.com/item.htm?id=' + str(goods_id)
                            data['username'] = '******'
                            data['main_goods_id'] = None
                            result = taobao.old_taobao_goods_insert_into_new_table(data, pipeline=self.my_pipeline)
                        else:
                            pass    # empty data: nothing to store
                else:
                    self.my_lg.info('数据库连接失败,数据库可能关闭或者维护中')
                    pass
                self.add_goods_index += 1
                gc.collect()
                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
            if result:      # only link goods that were inserted or already in db
                self._insert_into_goods_id_and_keyword_middle_table(goods_id=goods_id, keyword_id=keyword_id)
            else:
                pass
        self.my_lg.info('该关键字的商品已经抓取完毕!')
        return True

    def _1688_keywords_spider(self, **kwargs):
        '''
        Crawl + store every 1688 goods in goods_id_list, then link each stored
        (or already-stored) goods to the keyword in the middle table.

        :param kwargs: goods_id_list(list of id str), keyword_id(int)
        :return: True
        '''
        goods_id_list = kwargs.get('goods_id_list')
        keyword_id = kwargs.get('keyword_id')
        goods_url_list = ['https://detail.1688.com/offer/{0}.html'.format(item) for item in goods_id_list]
        self.my_lg.info('即将开始抓取该关键字的goods, 请耐心等待...')
        for item in goods_url_list:
            # BUGFIX: reset per item (see _taobao_keywords_spider)
            result = False
            try:
                goods_id = re.compile('offer/(.*?).html').findall(item)[0]
            except IndexError:
                self.my_lg.error('re获取goods_id时出错, 请检查!')
                continue
            if goods_id in self.db_existed_goods_id_list:
                self.my_lg.info('该goods_id[{0}]已存在于db中!'.format(goods_id))
                result = True   # already stored
            else:
                ali_1688 = ALi1688LoginAndParse()
                # reconnect every 20 goods to avoid a hung long-lived connection
                if self.add_goods_index % 20 == 0:
                    self.my_lg.info('正在重置,并与数据库建立新连接中...')
                    self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                    self.my_lg.info('与数据库的新连接成功建立...')
                if self.my_pipeline.is_connect_success:
                    goods_id = ali_1688.get_goods_id_from_url(item)
                    if goods_id == '':
                        self.my_lg.error('@@@ 原商品的地址为: {0}'.format(item))
                        continue
                    else:
                        self.my_lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (goods_id, str(self.add_goods_index)))
                        tt = ali_1688.get_ali_1688_data(goods_id)
                        if tt.get('is_delete') == 1 and tt.get('before') is False:
                            # goods already taken down: skip it
                            continue
                        data = ali_1688.deal_with_data()
                        if data != {}:
                            data['goods_id'] = goods_id
                            data['goods_url'] = 'https://detail.1688.com/offer/' + goods_id + '.html'
                            data['username'] = '******'
                            data['main_goods_id'] = None
                            result = ali_1688.old_ali_1688_goods_insert_into_new_table(data=data, pipeline=self.my_pipeline)
                        else:
                            pass    # empty data: nothing to store
                else:
                    self.my_lg.info('数据库连接失败,数据库可能关闭或者维护中')
                    pass
                self.add_goods_index += 1
                try:
                    del ali_1688
                except:
                    pass
                gc.collect()
                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
            if result:      # only link goods that were inserted or already in db
                self._insert_into_goods_id_and_keyword_middle_table(goods_id=goods_id, keyword_id=keyword_id)
            else:
                pass
        self.my_lg.info('该关键字的商品已经抓取完毕!')
        return True

    def _tmall_keywords_spider(self, **kwargs):
        '''
        Crawl + store every tmall goods in goods_id_list (here: goods URLS),
        then link each stored (or already-stored) goods to the keyword.

        :param kwargs: goods_id_list(list of goods URLs), keyword_id(int)
        :return: True
        '''
        goods_id_list = kwargs.get('goods_id_list')
        keyword_id = kwargs.get('keyword_id')
        # tmall search returns urls with skuId; strip it and make them absolute
        goods_url_list = ['https:' + re.compile('&skuId=.*').sub('', item) for item in goods_id_list]
        self.my_lg.info('即将开始抓取该关键字的goods, 请耐心等待...')
        for item in goods_url_list:     # item is a goods url
            # BUGFIX: reset per item (see _taobao_keywords_spider)
            result = False
            try:
                goods_id = re.compile(r'id=(\d+)').findall(item)[0]
            except IndexError:
                self.my_lg.error('re获取goods_id时出错, 请检查!')
                continue
            if goods_id in self.db_existed_goods_id_list:
                self.my_lg.info('该goods_id[{0}]已存在于db中!'.format(goods_id))
                result = True   # already stored
            else:
                tmall = TmallParse(logger=self.my_lg)
                # reconnect every 20 goods to avoid a hung long-lived connection
                if self.add_goods_index % 20 == 0:
                    self.my_lg.info('正在重置,并与数据库建立新连接中...')
                    self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                    self.my_lg.info('与数据库的新连接成功建立...')
                if self.my_pipeline.is_connect_success:
                    # TmallParse returns [type, goods_id] here, not a plain str
                    goods_id = tmall.get_goods_id_from_url(item)
                    if goods_id == []:
                        self.my_lg.error('@@@ 原商品的地址为: {0}'.format(item))
                        continue
                    else:
                        self.my_lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (goods_id[1], str(self.add_goods_index)))
                        tt = tmall.get_goods_data(goods_id)
                        data = tmall.deal_with_data()
                        goods_id = goods_id[1]
                        if data != {}:
                            data['goods_id'] = goods_id
                            data['username'] = '******'
                            data['main_goods_id'] = None
                            data['goods_url'] = tmall._from_tmall_type_get_tmall_url(type=data['type'], goods_id=goods_id)
                            if data['goods_url'] == '':
                                self.my_lg.error('该goods_url为空值! 此处跳过!')
                                continue
                            result = tmall.old_tmall_goods_insert_into_new_table(data, pipeline=self.my_pipeline)
                        else:
                            pass    # empty data: nothing to store
                else:
                    self.my_lg.info('数据库连接失败,数据库可能关闭或者维护中')
                    pass
                self.add_goods_index += 1
                gc.collect()
                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
            if result:      # only link goods that were inserted or already in db
                self._insert_into_goods_id_and_keyword_middle_table(goods_id=goods_id, keyword_id=keyword_id)
            else:
                pass
        self.my_lg.info('该关键字的商品已经抓取完毕!')
        return True

    def _insert_into_goods_id_and_keyword_middle_table(self, **kwargs):
        '''
        Insert the (goods_id, keyword_id) relation unless it already exists.

        :param kwargs: goods_id(str-able), keyword_id(int-able)
        :return: bool insert result (False when skipped or on error)
        '''
        goods_id = str(kwargs['goods_id'])
        keyword_id = int(kwargs['keyword_id'])
        result = False
        # first check whether the middle table already has this keyword for the goods
        # (partial sql fragments use plain '' in this codebase, not r'')
        sql_str = 'select keyword_id from dbo.goods_id_and_keyword_middle_table where goods_id=%s'
        try:
            _ = self.my_pipeline._select_table(sql_str=sql_str, params=(goods_id,))
            _ = [i[0] for i in _]
        except Exception:
            self.my_lg.error('执行中间表goods_id_and_keyword_middle_table是否已新增该关键字的id的sql语句时出错, 跳过给商品加keyword_id')
            return result
        if keyword_id not in _:
            params = (
                goods_id,
                keyword_id,
            )
            self.my_lg.info('------>>>| 正在插入keyword_id为{0}, goods_id为{1}'.format(params[1], params[0]))
            result = self.my_pipeline._insert_into_table_2(sql_str=self.add_keyword_id_for_goods_id_sql_str, params=params, logger=self.my_lg)
        return result

    def json_str_2_dict(self, json_str):
        '''
        Parse a json string; log and return {} on decode failure.

        :param json_str: str
        :return: dict
        '''
        try:
            data = loads(json_str)
        except JSONDecodeError:
            self.my_lg.error('json转换字符串时出错, 请检查!')
            data = {}
        return data

    def __del__(self):
        try:
            del self.my_lg
            del self.msg
            del self.my_pipeline
        except:
            pass
        try:
            del self.db_existed_goods_id_list
        except:
            pass
        gc.collect()
class TaoBaoWeiTaoShareParse():
    """Parser for taobao weitao (微淘) share articles.

    Resolves a shared short link to its target url, calls the h5 mtop
    content-service api, washes the response, and stores both the article
    and the goods it recommends via the SqlServer pipeline.
    """

    def __init__(self, logger=None):
        """
        :param logger: optional external logger; a file logger is created when None
        """
        self._set_headers()
        self._set_logger(logger)
        self.msg = ''
        self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()

    def _set_headers(self):
        """Build the request headers shared by all api calls.

        NOTE: a cookie header must NOT be sent — the h5 api rejects the
        request as illegal when one is present, so none is set here.
        """
        # NOTE(review): the '¶ms' inside the referer looks like a mangled
        # '&params' (HTML-entity damage) — verify against a captured request.
        self.headers = {
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'user-agent': get_random_pc_ua(),
            'accept': '*/*',
            'referer': 'https://market.m.taobao.com/apps/market/content/index.html?ut_sk=1.VmYadv9DXkkDAFZm0VV4JBNq_21380790_1527298517854.Copy.33¶ms=%7B%22csid%22%3A%2254a52aea54b7c29d289a0e36b2bf2f51%22%7D&wh_weex=true&contentId=200668154273&source=weitao_2017_nocover&data_prefetch=true&suid=3D763077-A7BF-43BC-9092-C17B35E896F9&wx_navbar_transparent=false&wx_navbar_hidden=false&sourceType=other&un=bc80c9f324602d31384c4a342af87869&share_crt_v=1&sp_tk=o6R2Q0ZDMHZvaDBlS6Ok&cpp=1&shareurl=true&spm=a313p.22.68.948703884987&short_name=h.WAjz5RP&app=chrome',
            'authority': 'h5api.m.taobao.com',
        }

    def _set_logger(self, logger):
        """Use the injected logger, or create a dated file logger for weitao."""
        if logger is None:
            self.my_lg = set_logger(
                log_file_name=MY_SPIDER_LOGS_PATH + '/淘宝/微淘/' + str(get_shanghai_time())[0:10] + '.txt',
                console_log_level=INFO,
                file_log_level=ERROR
            )
        else:
            self.my_lg = logger

    async def _get_target_url_and_content_id_and_csid(self, taobao_short_url):
        '''
        Resolve a taobao share short url.
        :param taobao_short_url:
        :return: (target_url, content_id, csid, tag_name, tag) — each element
                 is '' when it cannot be extracted.
        '''
        if re.compile(r'contentId').findall(taobao_short_url) != []:
            # the url is already the target address, no redirect lookup needed
            target_url = taobao_short_url
        else:
            body = MyRequests.get_url_body(url=taobao_short_url, headers=self.headers)
            if body == '':
                self.my_lg.error('获取到的body为空值, 出错短链接地址: {0}'.format(str(taobao_short_url)))
                # BUG FIX: callers unpack five values; the old 3-tuple here
                # raised ValueError on every failed short link.
                return '', '', '', '', ''
            try:
                # the landing page embeds the real target in a js redirect
                target_url = re.compile('var url = \'(.*?)\';').findall(body)[0]
            except IndexError:
                self.my_lg.error('获取target_url的时候IndexError! 出错短链接地址: {0}'.format(str(taobao_short_url)))
                target_url = ''

        try:
            content_id = re.compile('contentId=(\d+)').findall(target_url)[0]
        except IndexError:
            self.my_lg.error('获取content_id时IndexError! 出错短链接地址: {0}'.format(str(taobao_short_url)))
            content_id = ''
        try:
            csid = re.compile('csid%22%3A%22(.*?)%22%7D').findall(target_url)[0]
        except IndexError:
            # some share links legitimately carry no csid
            self.my_lg.info('此链接为无csid情况的链接...')
            csid = ''
        try:
            tag_name = re.compile('tagName=(.*?)&').findall(target_url)[0]
        except IndexError:
            tag_name = ''
        try:
            tag = re.compile('tag=(.*?)&').findall(target_url)[0]
        except IndexError:
            tag = ''

        return target_url, content_id, csid, tag_name, tag

    async def _get_api_body(self, taobao_short_url):
        '''
        Fetch the raw jsonp body of the content-service api for one share link.
        :param taobao_short_url:
        :return: body str ('' on any failure)
        '''
        base_url = 'https://h5api.m.taobao.com/h5/mtop.taobao.beehive.detail.contentservicenewv2/1.0/'
        try:
            target_url, content_id, csid, tag_name, tag = await self._get_target_url_and_content_id_and_csid(taobao_short_url)
        except ValueError:
            self.my_lg.error('遇到ValueError!', exc_info=True)
            return ''
        if content_id == '' and csid == '':
            # nothing identifiable to query — bail out
            return ''

        data = dumps({
            'businessSpm': '',
            'business_spm': '',
            'contentId': content_id,
            'params': dumps({
                "csid": csid,
            }) if csid != '' else '',   # omit the param entirely when there is no csid
            'source': 'weitao_2017_nocover',
            'tagName': tag_name,        # extra field used to drive the tags api
            'track_params': '',
            'type': 'h5',
        })
        params = {
            'AntiCreep': 'true',
            'AntiFlood': 'true',
            'api': 'mtop.taobao.beehive.detail.contentservicenewv2',
            'appKey': '12574478',
            'callback': 'mtopjsonp1',
            'data': data,
            'dataType': 'jsonp',
            'data_2': '',
            'jsv': '2.4.11',
            'type': 'jsonp',
            'v': '1.0'
        }
        # first request only obtains the anti-crawl token _m_h5_tk (and a session)
        result_1 = await get_taobao_sign_and_body(
            base_url=base_url,
            headers=self.headers,
            params=params,
            data=data,
            logger=self.my_lg
        )
        _m_h5_tk = result_1[0]
        if _m_h5_tk == '':
            self.my_lg.error('获取到的_m_h5_tk为空str! 出错短链接地址: {0}'.format(taobao_short_url))

        # second request re-uses the same session and carries _m_h5_tk
        result_2 = await get_taobao_sign_and_body(
            base_url=base_url,
            headers=self.headers,
            params=params,
            data=data,
            _m_h5_tk=_m_h5_tk,
            session=result_1[1],
            logger=self.my_lg
        )
        body = result_2[2]
        return body

    async def _deal_with_api_info(self, taobao_short_url):
        '''
        Parse the api response and persist the article + its goods.
        :param taobao_short_url:
        :return: True on success, False when the article is empty,
                 {} on fetch/parse failure
        '''
        data = await self._get_api_body(taobao_short_url)
        if data == '':
            self.my_lg.error('获取到的api数据为空值!')
            return {}

        try:
            # strip the jsonp wrapper: mtopjsonp1(...)
            data = re.compile('mtopjsonp1\((.*)\)').findall(data)[0]
        except IndexError:
            self.my_lg.error('re获取主信息失败, IndexError, 出错短链接地址:{0}'.format(taobao_short_url))
            data = {}

        try:
            data = await self._wash_api_info(loads(data))
        except Exception as e:
            self.my_lg.error('出错短链接地址:{0}'.format(taobao_short_url))
            self.my_lg.exception(e)
            return {}

        article = await self._get_article(data=data, taobao_short_url=taobao_short_url)
        pprint(article)
        if article != {} and article.get('share_id', '') != '':
            # first crawl the goods the article recommends, then store the article
            await self._crawl_and_save_these_goods(goods_url_list=article.get('goods_url_list', []))
            await self._save_this_article(article=article)
            return True
        else:
            self.my_lg.info('获取到的文章失败! article为空dict!')
            return False

    async def _crawl_and_save_these_goods(self, goods_url_list):
        '''
        Crawl and store the goods recommended by the article.
        :param goods_url_list: list of {'goods_url': ...} dicts
        :return: True
        '''
        sql_str = 'select GoodsID from dbo.GoodsInfoAutoGet where SiteID=1 or SiteID=3 or SiteID=4 or SiteID=6'
        try:
            result = self.my_pipeline._select_table(sql_str=sql_str)
        except TypeError:
            result = []

        self.my_lg.info('即将开始抓取该文章的goods, 请耐心等待...')
        index = 1
        db_all_goods_id_list = [item[0] for item in result]
        for item in goods_url_list:
            try:
                goods_id = re.compile(r'id=(\d+)').findall(item.get('goods_url', ''))[0]
            except IndexError:
                self.my_lg.error('re获取goods_id时出错, 请检查!')
                continue

            if goods_id in db_all_goods_id_list:
                self.my_lg.info('该goods_id[{0}]已存在于db中!'.format(goods_id))
                continue
            else:
                taobao = TaoBaoLoginAndParse(logger=self.my_lg)
                if index % 50 == 0:
                    # reconnect every 50 iterations to avoid a stale long-lived connection
                    self.my_lg.info('正在重置,并与数据库建立新连接中...')
                    self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                    self.my_lg.info('与数据库的新连接成功建立...')

                if self.my_pipeline.is_connect_success:
                    goods_id = taobao.get_goods_id_from_url(item.get('goods_url', ''))
                    if goods_id == '':
                        self.my_lg.info('@@@ 原商品的地址为: {0}'.format(item.get('goods_url', '')))
                        continue
                    else:
                        self.my_lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (goods_id, str(index)))
                        tt = taobao.get_goods_data(goods_id)
                        data = taobao.deal_with_data(goods_id=goods_id)
                        if data != {}:
                            data['goods_id'] = goods_id
                            data['goods_url'] = 'https://item.taobao.com/item.htm?id=' + str(goods_id)
                            data['username'] = '******'
                            data['main_goods_id'] = None
                            taobao.old_taobao_goods_insert_into_new_table(data, pipeline=self.my_pipeline)
                        else:
                            pass
                else:
                    self.my_lg.info('数据库连接失败,数据库可能关闭或者维护中')
                    pass
            index += 1
            gc.collect()
            await asyncio.sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
        self.my_lg.info('该文章的商品已经抓取完毕!')

        return True

    async def _save_this_article(self, article):
        '''
        Store the article itself (dbo.daren_recommend), skipping known share_ids.
        :param article: WellRecommendArticle
        :return: True on stored/duplicate, False on db failure
        '''
        sql_str = 'select share_id from dbo.daren_recommend'
        db_share_id = [j[0] for j in list(self.my_pipeline._select_table(sql_str=sql_str))]
        if article.get('share_id') in db_share_id:
            self.my_lg.info('该share_id({})已存在于数据库中, 此处跳过!'.format(article.get('share_id', '')))
            return True
        else:
            self.my_lg.info('即将开始存储该文章...')
            if self.my_pipeline.is_connect_success:
                params = await self._get_db_insert_params(item=article)
                sql_str = r'insert into dbo.daren_recommend(nick_name, head_url, profile, share_id, gather_url, title, comment_content, share_goods_base_info, div_body, create_time, site_id) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
                self.my_pipeline._insert_into_table_2(sql_str=sql_str, params=params, logger=self.my_lg)
                return True
            else:
                self.my_lg.error('db连接失败!存储失败! 出错article地址:{0}'.format(article.get('gather_url', '')))
                return False

    async def _get_db_insert_params(self, item):
        """Build the parameter tuple matching the daren_recommend insert columns."""
        params = (
            item['nick_name'],
            item['head_url'],
            item['profile'],
            item['share_id'],
            item['gather_url'],
            item['title'],
            item['comment_content'],
            dumps(item['share_goods_base_info'], ensure_ascii=False),
            item['div_body'],
            item['create_time'],
            item['site_id'],
        )
        return params

    async def _get_article(self, data, taobao_short_url):
        '''
        Extract the article fields needed for storage from the washed api data.
        :param data: washed api dict
        :param taobao_short_url:
        :return: WellRecommendArticle-like mapping, or {} on any failure
        '''
        try:
            nick_name = data.get('data', {}).get('models', {}).get('account', {}).get('name', '')
            assert nick_name != '', '获取到的nick_name为空值!'
            head_url = await self._get_head_url(data=data)
            # the recommender's bio / signature; api may return None explicitly
            tmp_profile = data.get('data', {}).get('models', {}).get('account', {}).get('accountDesc', '')
            profile = tmp_profile if tmp_profile is not None else ''
            title = self._wash_sensitive_info(data.get('data', {}).get('models', {}).get('content', {}).get('title', ''))
            assert title != '', '获取到的title为空值!请检查!'
            # recommender's summary text, usable on the recommendation home page
            comment_content = self._wash_sensitive_info(data.get('data', {}).get('models', {}).get('content', {}).get('summary', ''))
            # drawerList pairs each image with its goods item, in order
            tmp_goods_list = data.get('data', {}).get('models', {}).get('content', {}).get('drawerList', [])
            assert tmp_goods_list != [], '获取到的goods_id_list为空list! 请检查! 可能该文章推荐商品为空[]!'
            share_img_url_list = [{'img_url': 'https:' + item.get('itemImages', [])[0].get('picUrl', '')} for item in tmp_goods_list]
            goods_id_list = [{'goods_id': item.get('itemId', '')} for item in tmp_goods_list]
            # images and goods correspond one-to-one, so store them as a single
            # field; de-duplicate while preserving the original order
            share_goods_base_info = list_duplicate_remove([{
                'img_url': 'https:' + item.get('itemImages', [])[0].get('picUrl', ''),
                'goods_id': item.get('itemId', ''),
            } for item in tmp_goods_list])
            div_body = self._wash_sensitive_info(await self._get_div_body(rich_text=data.get('data', {}).get('models', {}).get('content', {}).get('richText', [])))
            # uniform taobao item urls; tmall items get redirected by the browser
            goods_url_list = [{'goods_url': 'https://item.taobao.com/item.htm?id=' + item.get('goods_id', '')} for item in goods_id_list]
            _ = (await self._get_target_url_and_content_id_and_csid(taobao_short_url))
            gather_url = _[0]
            share_id = _[1]     # i.e. the content_id
            create_time = get_shanghai_time()
            site_id = 2         # taobao weitao
            # extra related-article links
            tags = await self._get_tags(data=data)
        except Exception as e:
            self.my_lg.error('出错短链接地址:{0}'.format(taobao_short_url))
            self.my_lg.exception(e)
            return {}

        article = WellRecommendArticle()
        article['nick_name'] = nick_name
        article['head_url'] = head_url
        article['profile'] = profile
        article['share_id'] = share_id
        article['title'] = title
        article['comment_content'] = comment_content
        article['share_img_url_list'] = share_img_url_list
        article['goods_id_list'] = goods_id_list
        article['div_body'] = div_body
        article['gather_url'] = gather_url
        article['create_time'] = create_time
        article['site_id'] = site_id
        article['goods_url_list'] = goods_url_list
        article['tags'] = tags
        article['share_goods_base_info'] = share_goods_base_info

        return article

    async def _get_head_url(self, data):
        '''
        Extract the author's avatar url, forcing an https scheme.
        :param data:
        :return: str ('' when absent)
        '''
        tmp_head_url = data.get('data', {}).get('models', {}).get('account', {}).get('accountPic', {}).get('picUrl', '')
        if tmp_head_url != '':
            if re.compile('http').findall(tmp_head_url) == []:
                head_url = 'https:' + tmp_head_url
            else:
                head_url = tmp_head_url
        else:
            head_url = ''

        return head_url

    def _wash_sensitive_info(self, data):
        '''
        Strip competitor/platform brand words from user-visible text.
        :param data: str
        :return: washed str
        '''
        data = re.compile('淘宝|天猫|taobao|tmall|TAOBAO|TMALL').sub('', data)

        return data

    async def _get_tags(self, data):
        '''
        Collect the extra related-article links (name + decoded url).
        :param data:
        :return: list of {'url': ..., 'name': ...}
        '''
        tags = data.get('data', {}).get('models', {}).get('tags', [])
        tags = [{
            'url': unquote(item.get('url', '')),
            'name': item.get('name', ''),
        } for item in tags]

        return tags

    async def _get_div_body(self, rich_text):
        '''
        Render the article's rich text into a single html <div> body.
        Text becomes <p>, pictures become sized <img>, goods become a hidden
        <p> marker carrying the goods_id.
        :param rich_text: raw rich-text list from the api
        :return: html str ('' when there is no content)
        '''
        div_body = ''
        for item in rich_text:
            if item.get('resource') is None:
                continue
            for resource_item in item.get('resource', []):  # may hold several resources
                text = resource_item.get('text', '')        # descriptive text
                picture = resource_item.get('picture', {})  # descriptive image
                _goods = resource_item.get('item', {})      # one recommended goods
                if text != '':
                    text = '<p style="height:auto;width:100%">' + text + '</p>' + '<br>'
                    div_body += text
                    continue

                if picture != {}:
                    # build an <img> tag carrying the picture's own width/height
                    _ = r'<img src="{0}" style="height:{1}px;width:{2}px;"/>'.format(
                        'https:' + picture.get('picUrl', ''),
                        picture.get('picHeight', ''),
                        picture.get('picWidth', '')
                    )
                    _ = _ + '<br>'
                    div_body += _
                    continue

                if _goods != {}:
                    _hiden_goods_id = r'<p style="display:none;">此处有个商品[goods_id]: {0}</p>'.format(_goods.get('itemId', '')) + '<br>'
                    div_body += _hiden_goods_id
                    continue

        return '<div>' + div_body + '</div>' if div_body != '' else ''

    async def _wash_api_info(self, data):
        '''
        Drop bulky sections of the api payload that are never used.
        :param data: dict
        :return: washed dict (unchanged when keys are missing)
        '''
        try:
            data['data']['assets'] = []
            data['data']['models']['config'] = {}
            data['data']['modules'] = []
        except Exception:
            pass

        return data

    def __del__(self):
        try:
            del self.my_lg
            del self.msg
            del self.my_pipeline
        except Exception:
            pass
        gc.collect()
class GoodsKeywordsSpider(Crawler):
    """Keyword-driven goods crawler.

    For every keyword in the db it fetches the top-selling goods from the
    enabled e-commerce sites (taobao / 1688 / tmall / jd), stores new goods,
    and links each stored goods_id to its keyword in the middle table.
    """

    def __init__(self):
        super(GoodsKeywordsSpider, self).__init__(
            ip_pool_type=IP_POOL_TYPE,
            log_print=True,
            logger=None,
            log_save_path=MY_SPIDER_LOGS_PATH + '/goods_keywords/_/',
        )
        self.msg = ''
        # BUG FIX: _init_debugging_api() was previously called twice, with the
        # first result discarded — a single call is sufficient.
        self.debugging_api = self._init_debugging_api()
        self._set_func_name_dict()
        self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
        # sql used to insert rows into goods_id_and_keyword_middle_table
        self.add_keyword_id_for_goods_id_sql_str = kw_insert_str_1

    def _init_debugging_api(self):
        '''
        Map site_id -> enabled flag for keyword hot-goods crawling.
        :return: dict
        '''
        return {
            1: True,    # taobao
            2: True,    # 1688
            3: True,    # tmall
            4: True,    # jd
        }

    def _set_func_name_dict(self):
        # templates describing the per-site spider entry points (kept for
        # compatibility; dispatch itself happens in _deal_with_goods_id_list)
        self.func_name_dict = {
            'taobao': 'self._taobao_keywords_spider(goods_id_list={0}, keyword_id={1})',
            'ali': 'self._ali_keywords_spider(goods_id_list={0}, keyword_id={1})',
            'tmall': 'self._tmall_keywords_spider(goods_id_list={0}, keyword_id={1})',
            'jd': 'self._jd_keywords_spider(goods_id_list={0}, keyword_id={1})'
        }

    def _just_run(self):
        """Main loop: for each db keyword, crawl every enabled site in turn."""
        while True:
            # fetch the keyword list and all goods_ids already present in the db
            try:
                result = list(self.my_pipeline._select_table(sql_str=kw_select_str_1))
                self.lg.info('正在获取db中已存在的goods_id...')
                result_2 = list(self.my_pipeline._select_table(sql_str=kw_select_str_2))
                self.lg.info('db中已存在的goods_id获取成功!')
            except TypeError:
                self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
                result = None
                result_2 = None

            if result is not None and result_2 is not None:
                self.lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
                self.lg.info(str(result))
                self.lg.info('--------------------------------------------------------')
                self.lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
                self.add_goods_index = 0    # counts goods handled, drives db reconnects
                self.db_existed_goods_id_list = [item[0] for item in result_2]
                # release the raw result set as early as possible
                try:
                    del result_2
                except Exception:
                    pass
                collect()

                for item in result:
                    # finish all enabled sites for one keyword before moving on
                    self.lg.info('正在处理id为{0}, 关键字为 {1} ...'.format(item[0], item[1]))
                    for type, type_value in self.debugging_api.items():
                        if type_value is False:
                            self.lg.info('api为False, 跳过!')
                            continue
                        self.my_pipeline = _block_get_new_db_conn(
                            db_obj=self.my_pipeline,
                            index=self.add_goods_index,
                            logger=self.lg,
                            remainder=20,
                        )
                        goods_id_list = self._get_keywords_goods_id_list(type=type, keyword=item)
                        self.lg.info('关键字为{0}, 获取到的goods_id_list 如下: {1}'.format(item[1], str(goods_id_list)))
                        '''处理goods_id_list'''
                        self._deal_with_goods_id_list(
                            type=type,
                            goods_id_list=goods_id_list,
                            keyword_id=item[0])
                        sleep(3)

    def _get_keywords_goods_id_list(self, type, keyword):
        '''
        Fetch the hot goods_id list for one keyword on one site.
        :param type: site type (1 taobao, 2 ali1688, 3 tmall, 4 jd)
        :param keyword: (id, keyword) tuple
        :return: list
        '''
        if type == 1:
            self.lg.info('下面是淘宝的关键字采集...')
            goods_id_list = self._get_taobao_goods_keywords_goods_id_list(keyword=keyword)
        elif type == 2:
            self.lg.info('下面是阿里1688的关键字采集...')
            goods_id_list = self._get_1688_goods_keywords_goods_id_list(keyword=keyword)
        elif type == 3:
            self.lg.info('下面是天猫的关键字采集...')
            goods_id_list = self._get_tmall_goods_keywords_goods_id_list(keyword=keyword)
        elif type == 4:
            self.lg.info('下面是京东的关键字采集...')
            goods_id_list = self._get_jd_goods_keywords_goods_id_list(keyword=keyword)
        else:
            goods_id_list = []

        return goods_id_list

    def _deal_with_goods_id_list(self, **kwargs):
        '''
        Dispatch the goods_id list to the matching site spider.
        :param kwargs: type, goods_id_list, keyword_id
        :return: None
        '''
        type = kwargs.get('type', '')
        goods_id_list = kwargs.get('goods_id_list', [])
        keyword_id = kwargs.get('keyword_id', '')

        if type == 1:
            self._taobao_keywords_spider(goods_id_list=goods_id_list, keyword_id=keyword_id)
        elif type == 2:
            self._1688_keywords_spider(goods_id_list=goods_id_list, keyword_id=keyword_id)
        elif type == 3:
            self._tmall_keywords_spider(goods_id_list=goods_id_list, keyword_id=keyword_id)
        elif type == 4:
            self._jd_keywords_spider(goods_id_list=goods_id_list, keyword_id=keyword_id)
        else:
            pass

        return None

    def _get_taobao_goods_keywords_goods_id_list(self, keyword):
        '''
        Taobao keyword search, sorted by sales descending.
        :param keyword: (id, keyword)
        :return: list of goods_id strings
        '''
        headers = {
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'user-agent': get_random_pc_ua(),
            'accept': '*/*',
            'authority': 's.taobao.com',
        }
        # sale-desc gives the sales ranking for the keyword
        params = (
            ('data-key', 'sort'),
            ('data-value', 'sale-desc'),
            ('ajax', 'true'),
            ('callback', 'jsonp396'),
            ('q', keyword[1]),
            ('imgfile', ''),
            ('commend', 'all'),
            ('ssid', 's5-e'),
            ('search_type', 'item'),
            ('sourceId', 'tb.index'),
            ('ie', 'utf8'),
        )

        s_url = 'https://s.taobao.com/search'
        body = Requests.get_url_body(url=s_url, headers=headers, params=params, ip_pool_type=self.ip_pool_type)
        if body == '':
            return []
        else:
            try:
                # strip the jsonp wrapper
                data = re.compile('\((.*)\)').findall(body)[0]
            except IndexError:
                self.lg.error('re获取淘宝data时出错, 出错关键字为{0}'.format(keyword[1]))
                return []

            data = json_2_dict(json_str=data, logger=self.lg)
            if data == {}:
                self.lg.error('获取到的淘宝搜索data为空dict! 出错关键字为{0}'.format(keyword[1]))
                return []
            else:
                goods_id_list = data.get('mainInfo', {}).get('traceInfo', {}).get('traceData', {}).get('allNids', [])
                if goods_id_list is None or goods_id_list == []:
                    self.lg.error('获取淘宝搜索goods_id_list为空list! 出错关键字{0}'.format(keyword[1]))
                    return []
                else:
                    return goods_id_list

    def _get_1688_goods_keywords_goods_id_list(self, keyword):
        '''
        1688 keyword search: top-selling goods from page 1 of m.1688.com.
        :param keyword: (id, keyword)
        :return: list of goods_id strings, eg: ['11111', ...]
        '''
        headers = {
            'authority': 'm.1688.com',
            'cache-control': 'max-age=0',
            'upgrade-insecure-requests': '1',
            'user-agent': get_random_pc_ua(),
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
        }
        # sortType=booked orders by completed orders (sales)
        params = (
            ('sortType', 'booked'),
            ('filtId', ''),
            ('keywords', keyword[1]),
            ('descendOrder', 'true'),
        )
        url = 'https://m.1688.com/offer_search/-6161.html'
        body = Requests.get_url_body(url=url, headers=headers, params=params, ip_pool_type=self.ip_pool_type)
        if body == '':
            return []
        else:
            try:
                goods_id_list = Selector(text=body).css('div.list_group-item::attr("data-offer-id")').extract()
            except Exception as e:
                self.lg.exception(e)
                self.lg.error('获取1688搜索goods_id_list为空list! 出错关键字{0}'.format(keyword[1]))
                goods_id_list = []

        return goods_id_list

    def _get_tmall_goods_keywords_goods_id_list(self, keyword):
        '''
        Tmall keyword search via the m-site api.
        :param keyword: (id, keyword)
        :return: list of item URLS (not goods_ids),
                 eg: ['//detail.tmall.com/item.htm?id=566978017832&skuId=...', ...]
        '''
        # m-site search: occasionally flaky but still usable
        headers = {
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'user-agent': get_random_pc_ua(),
            'accept': '*/*',
            'authority': 'list.tmall.com',
        }
        params = {
            'page_size': '20',
            'page_no': '1',
            'q': str(keyword[1]),
            'type': 'p',
            'spm': 'a220m.6910245.a2227oh.d100',
            'from': 'mallfp..m_1_suggest',
            'sort': 'd',
        }

        s_url = 'https://list.tmall.com/m/search_items.htm'
        body = Requests.get_url_body(url=s_url, headers=headers, params=params, ip_pool_type=self.ip_pool_type)
        if body == '':
            return []
        else:
            data = json_2_dict(json_str=body, logger=self.lg)
            if data == {}:
                self.lg.error('获取到的天猫搜索data为空dict! 出错关键字为{0}'.format(keyword[1]))
                return []
            else:
                _ = data.get('item', [])
                if _ is None or _ == []:
                    self.lg.error('获取天猫搜索goods_id_list为空list! 出错关键字{0}'.format(keyword[1]))
                    return []
                try:
                    goods_id_list = [str(item.get('url', '')) for item in _]
                except Exception as e:
                    self.lg.exception(e)
                    self.lg.error('获取天猫搜索goods_id_list为空list! 出错关键字{0}'.format(keyword[1]))
                    return []

                return goods_id_list

    def _get_jd_goods_keywords_goods_id_list(self, keyword):
        '''
        JD keyword search via the m-site search api, sales descending.
        Pingou (拼购) items are filtered out.
        :param keyword: (id, keyword)
        :return: [] or ['xxxx', ....]
        '''
        headers = {
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'user-agent': get_random_pc_ua(),
            'accept': '*/*',
            'authority': 'so.m.jd.com',
        }
        params = (
            ('keyword', keyword[1]),
            ('datatype', '1'),
            ('callback', 'jdSearchResultBkCbA'),
            ('page', '1'),
            ('pagesize', '10'),
            ('ext_attr', 'no'),
            ('brand_col', 'no'),
            ('price_col', 'no'),
            ('color_col', 'no'),
            ('size_col', 'no'),
            ('ext_attr_sort', 'no'),
            ('merge_sku', 'yes'),
            ('multi_suppliers', 'yes'),
            ('area_ids', '1,72,2819'),
            ('sort_type', 'sort_totalsales15_desc'),
            ('qp_disable', 'no'),
            ('fdesc', '\u5317\u4EAC'),
        )

        s_url = 'https://so.m.jd.com/ware/search._m2wq_list'
        body = Requests.get_url_body(url=s_url, headers=headers, params=params, ip_pool_type=self.ip_pool_type)
        if body == '':
            return []
        else:
            try:
                data = re.compile('jdSearchResultBkCbA\((.*)\)').findall(body)[0]
            except IndexError:
                self.lg.error('获取jd的关键字数据时, IndexError! 出错关键字为{0}'.format((keyword[1])))
                return []

            # the payload contains \xa0-style escapes mixed with sequences like
            # \http that must not be unescaped; normalize before json parsing
            data = deal_with_JSONDecodeError_about_value_invalid_escape(json_str=data)
            data = json_2_dict(json_str=data, logger=self.lg)
            if data == {}:
                # BUG FIX: message previously said 天猫 (tmall) — this is the jd path
                self.lg.error('获取到的京东搜索data为空dict! 出错关键字为{0}'.format(keyword[1]))
                return []
            else:
                data = data.get('data', {}).get('searchm', {}).get('Paragraph', [])
                # a non-empty 'bp' in item['pinGou'] marks a pingou goods — skip those
                if data is not None and data != []:
                    goods_id_list = [
                        item.get('wareid', '') for item in data
                        if item.get('pinGou', {}).get('bp', '') == ''
                    ]
                    return goods_id_list
                else:
                    self.lg.error('获取到的data为空list, 请检查!')
                    return []

    def _taobao_keywords_spider(self, **kwargs):
        '''
        Crawl and store the taobao goods for one keyword.
        :param kwargs: goods_id_list, keyword_id
        :return: True
        '''
        goods_id_list = kwargs.get('goods_id_list')
        keyword_id = kwargs.get('keyword_id')
        goods_url_list = ['https://item.taobao.com/item.htm?id=' + item for item in goods_id_list]

        self.lg.info('即将开始抓取该关键字的goods, 请耐心等待...')
        for item in goods_url_list:     # item is a goods_url
            result = False              # whether this goods ended up linked to the keyword
            try:
                goods_id = re.compile(r'id=(\d+)').findall(item)[0]
            except IndexError:
                self.lg.error('re获取goods_id时出错, 请检查!')
                continue

            if goods_id in self.db_existed_goods_id_list:
                self.lg.info('该goods_id[{0}]已存在于db中!'.format(goods_id))
                result = True           # already stored previously
                pass
            else:
                taobao = TaoBaoLoginAndParse(logger=self.lg)
                if self.add_goods_index % 20 == 0:
                    # reconnect every 20 goods to avoid a stale long-lived connection
                    self.lg.info('正在重置,并与数据库建立新连接中...')
                    self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                    self.lg.info('与数据库的新连接成功建立...')

                if self.my_pipeline.is_connect_success:
                    goods_id = taobao.get_goods_id_from_url(item)
                    if goods_id == '':
                        self.lg.error('@@@ 原商品的地址为: {0}'.format(item))
                        continue
                    else:
                        self.lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (goods_id, str(self.add_goods_index)))
                        tt = taobao.get_goods_data(goods_id)
                        data = taobao.deal_with_data(goods_id=goods_id)
                        if data != {}:
                            data['goods_id'] = goods_id
                            data['goods_url'] = 'https://item.taobao.com/item.htm?id=' + str(goods_id)
                            data['username'] = '******'
                            data['main_goods_id'] = None
                            result = taobao.old_taobao_goods_insert_into_new_table(data, pipeline=self.my_pipeline)
                        else:
                            pass
                else:
                    self.lg.info('数据库连接失败,数据库可能关闭或者维护中')
                    pass
                self.add_goods_index += 1
                collect()
                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
            if result:      # goods was inserted now, or already existed in db
                self._insert_into_goods_id_and_keyword_middle_table(goods_id=goods_id, keyword_id=keyword_id)
            else:
                pass

        self.lg.info('该关键字的商品已经抓取完毕!')

        return True

    def _1688_keywords_spider(self, **kwargs):
        '''
        Crawl and store the 1688 goods for one keyword.
        :param kwargs: goods_id_list, keyword_id
        :return: True
        '''
        goods_id_list = kwargs.get('goods_id_list')
        keyword_id = kwargs.get('keyword_id')
        goods_url_list = ['https://detail.1688.com/offer/{0}.html'.format(item) for item in goods_id_list]

        self.lg.info('即将开始抓取该关键字的goods, 请耐心等待...')
        for item in goods_url_list:
            result = False      # reset for every goods
            try:
                goods_id = re.compile('offer/(.*?).html').findall(item)[0]
            except IndexError:
                self.lg.error('re获取goods_id时出错, 请检查!')
                continue

            if goods_id in self.db_existed_goods_id_list:
                self.lg.info('该goods_id[{0}]已存在于db中!'.format(goods_id))
                result = True   # already stored previously
                pass
            else:
                ali_1688 = ALi1688LoginAndParse(logger=self.lg)
                if self.add_goods_index % 20 == 0:
                    # reconnect every 20 goods to avoid a stale long-lived connection
                    self.lg.info('正在重置,并与数据库建立新连接中...')
                    self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                    self.lg.info('与数据库的新连接成功建立...')

                if self.my_pipeline.is_connect_success:
                    goods_id = ali_1688.get_goods_id_from_url(item)
                    if goods_id == '':
                        self.lg.error('@@@ 原商品的地址为: {0}'.format(item))
                        continue
                    else:
                        self.lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (goods_id, str(self.add_goods_index)))
                        tt = ali_1688.get_ali_1688_data(goods_id)
                        if tt.get('is_delete') == 1 and tt.get('before') is False:
                            # goods already off the shelf — skip it
                            continue

                        data = ali_1688.deal_with_data()
                        if data != {}:
                            data['goods_id'] = goods_id
                            data['goods_url'] = 'https://detail.1688.com/offer/' + goods_id + '.html'
                            data['username'] = '******'
                            data['main_goods_id'] = None
                            result = ali_1688.old_ali_1688_goods_insert_into_new_table(data=data, pipeline=self.my_pipeline)
                        else:
                            pass
                else:
                    self.lg.info('数据库连接失败,数据库可能关闭或者维护中')
                    pass
                self.add_goods_index += 1
                try:
                    del ali_1688
                except Exception:
                    pass
                collect()
                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
            if result:      # goods was inserted now, or already existed in db
                self._insert_into_goods_id_and_keyword_middle_table(goods_id=goods_id, keyword_id=keyword_id)
            else:
                pass

        self.lg.info('该关键字的商品已经抓取完毕!')

        return True

    def _tmall_keywords_spider(self, **kwargs):
        """
        Crawl and store the tmall goods for one keyword.
        :param kwargs: goods_id_list, keyword_id
        :return: True
        """
        goods_id_list = kwargs.get('goods_id_list')
        keyword_id = kwargs.get('keyword_id')
        # entries are protocol-relative urls possibly carrying a skuId — strip it
        goods_url_list = ['https:' + re.compile('&skuId=.*').sub('', item) for item in goods_id_list]

        self.lg.info('即将开始抓取该关键字的goods, 请耐心等待...')
        for item in goods_url_list:     # item is a goods_url
            result = False              # whether this goods ended up linked to the keyword
            try:
                goods_id = re.compile(r'id=(\d+)').findall(item)[0]
            except IndexError:
                self.lg.error('re获取goods_id时出错, 请检查!')
                continue

            if goods_id in self.db_existed_goods_id_list:
                self.lg.info('该goods_id[{0}]已存在于db中!'.format(goods_id))
                result = True           # already stored previously
                pass
            else:
                tmall = TmallParse(logger=self.lg)
                if self.add_goods_index % 20 == 0:
                    # reconnect every 20 goods to avoid a stale long-lived connection
                    self.lg.info('正在重置,并与数据库建立新连接中...')
                    self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                    self.lg.info('与数据库的新连接成功建立...')

                if self.my_pipeline.is_connect_success:
                    # returns [type, goods_id] on success, [] on failure
                    goods_id = tmall.get_goods_id_from_url(item)
                    if goods_id == []:
                        self.lg.error('@@@ 原商品的地址为: {0}'.format(item))
                        continue
                    else:
                        self.lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (goods_id[1], str(self.add_goods_index)))
                        tt = tmall.get_goods_data(goods_id)
                        data = tmall.deal_with_data()
                        goods_id = goods_id[1]
                        if data != {}:
                            data['goods_id'] = goods_id
                            data['username'] = '******'
                            data['main_goods_id'] = None
                            data['goods_url'] = tmall._from_tmall_type_get_tmall_url(type=data['type'], goods_id=goods_id)
                            if data['goods_url'] == '':
                                self.lg.error('该goods_url为空值! 此处跳过!')
                                continue

                            result = tmall.old_tmall_goods_insert_into_new_table(data, pipeline=self.my_pipeline)
                        else:
                            pass
                else:
                    self.lg.info('数据库连接失败,数据库可能关闭或者维护中')
                    pass
                self.add_goods_index += 1
                collect()
                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
            if result:      # goods was inserted now, or already existed in db
                self._insert_into_goods_id_and_keyword_middle_table(goods_id=goods_id, keyword_id=keyword_id)
            else:
                pass

        self.lg.info('该关键字的商品已经抓取完毕!')

        return True

    def _jd_keywords_spider(self, **kwargs):
        '''
        Crawl and store the jd goods for one keyword.
        :param kwargs: goods_id_list, keyword_id
        :return: True
        '''
        goods_id_list = kwargs.get('goods_id_list')
        keyword_id = kwargs.get('keyword_id')
        # https://item.jd.com/xxxxx.html works as the initial url because jd
        # redirects to the canonical address; it is also safe to store as-is.
        # so jd goods are not sub-categorized and are stored as site_id = 7
        goods_url_list = ['https://item.jd.com/{0}.html'.format(str(item)) for item in goods_id_list]

        self.lg.info('即将开始抓取该关键字的goods, 请耐心等待...')
        for item in goods_url_list:     # item is a goods_url
            result = False              # whether this goods ended up linked to the keyword
            try:
                goods_id = re.compile('\/(\d+)\.html').findall(item)[0]
            except IndexError:
                self.lg.error('re获取goods_id时出错, 请检查!')
                continue

            if goods_id in self.db_existed_goods_id_list:
                self.lg.info('该goods_id[{0}]已存在于db中!'.format(goods_id))
                result = True           # already stored previously
                pass
            else:
                jd = JdParse(logger=self.lg)
                if self.add_goods_index % 20 == 0:
                    # reconnect every 20 goods to avoid a stale long-lived connection
                    self.lg.info('正在重置,并与数据库建立新连接中...')
                    self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                    self.lg.info('与数据库的新连接成功建立...')

                if self.my_pipeline.is_connect_success:
                    # returns [type, goods_id] on success, [] on failure
                    goods_id = jd.get_goods_id_from_url(item)
                    if goods_id == []:
                        self.lg.error('@@@ 原商品的地址为: {0}'.format(item))
                        continue
                    else:
                        self.lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (goods_id[1], str(self.add_goods_index)))
                        tt = jd.get_goods_data(goods_id)
                        data = jd.deal_with_data(goods_id)
                        goods_id = goods_id[1]
                        if data != {}:
                            data['goods_id'] = goods_id
                            data['username'] = '******'
                            data['main_goods_id'] = None
                            data['goods_url'] = item
                            result = jd.old_jd_goods_insert_into_new_table(data, self.my_pipeline)
                        else:
                            pass
                else:
                    self.lg.info('数据库连接失败,数据库可能关闭或者维护中')
                    pass
                self.add_goods_index += 1
                sleep(1)
                try:
                    del jd
                except Exception:
                    pass
                collect()
            if result:      # goods was inserted now, or already existed in db
                self._insert_into_goods_id_and_keyword_middle_table(goods_id=goods_id, keyword_id=keyword_id)
            else:
                pass

        self.lg.info('该关键字的商品已经抓取完毕!')

        return True

    def _insert_into_goods_id_and_keyword_middle_table(self, **kwargs):
        '''
        Link a goods_id to a keyword_id in goods_id_and_keyword_middle_table,
        skipping pairs that already exist.
        :param kwargs: goods_id, keyword_id
        :return: bool — True when the link was inserted
        '''
        goods_id = str(kwargs['goods_id'])
        keyword_id = int(kwargs['keyword_id'])
        result = False

        # check whether this goods already carries the keyword_id
        try:
            _ = self.my_pipeline._select_table(sql_str=kw_select_str_3, params=(goods_id,))
            _ = [i[0] for i in _]
        except Exception:
            self.lg.error('执行中间表goods_id_and_keyword_middle_table是否已新增该关键字的id的sql语句时出错, 跳过给商品加keyword_id')
            return result

        if keyword_id not in _:
            params = (
                goods_id,
                keyword_id,
            )
            self.lg.info('------>>>| 正在插入keyword_id为{0}, goods_id为{1}'.format(params[1], params[0]))
            result = self.my_pipeline._insert_into_table_2(sql_str=self.add_keyword_id_for_goods_id_sql_str, params=params, logger=self.lg)

        return result

    def _add_keyword_2_db_from_excel_file(self):
        '''
        Import new keywords into the db from a local excel file,
        skipping keywords already present.
        :return: bool — False when the excel file cannot be read
        '''
        excel_file_path = '/Users/afa/Desktop/2018-07-18-淘宝phone-top20万.xlsx'
        self.lg.info('正在读取{0}, 请耐心等待...'.format(excel_file_path))
        try:
            excel_result = read_info_from_excel_file(excel_file_path=excel_file_path)
        except Exception:
            self.lg.error('遇到错误:', exc_info=True)
            return False
        self.lg.info('读取完毕!!')

        self.lg.info('正在读取db中原先的keyword...')
        db_keywords = self.my_pipeline._select_table(sql_str=kw_select_str_4)
        db_keywords = [i[0] for i in db_keywords]
        self.lg.info('db keywords 读取完毕!')

        for item in excel_result:
            keyword = item.get('关键词', None)
            if not keyword:
                continue

            if keyword in db_keywords:
                self.lg.info('该关键字{0}已经存在于db中...'.format(keyword))
                continue

            self.lg.info('------>>>| 正在存储关键字 {0}'.format(keyword))
            self.my_pipeline._insert_into_table_2(sql_str=kw_insert_str_2, params=(str(keyword), 0), logger=self.lg)

        self.lg.info('全部写入完毕!')

        return True

    def __del__(self):
        try:
            del self.lg
            del self.msg
            del self.my_pipeline
        except Exception:
            pass
        try:
            del self.db_existed_goods_id_list
        except Exception:
            pass
        collect()
class GoodsKeywordsSpider(AsyncCrawler):
    """Keyword-driven goods spider.

    Reads keywords from the db, searches each enabled e-commerce site
    (taobao / 1688 / tmall / jd) for top-selling goods, stores new goods
    and links each stored goods_id to its keyword through the
    goods_id_and_keyword_middle_table.
    """

    def __init__(self):
        super(GoodsKeywordsSpider, self).__init__(
            ip_pool_type=IP_POOL_TYPE,
            log_print=True,
            logger=None,
            log_save_path=MY_SPIDER_LOGS_PATH + '/goods_keywords/_/',
        )
        self.msg = ''
        self.debugging_api = self._init_debugging_api()
        self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        # sql used to insert rows into goods_id_and_keyword_middle_table
        self.add_keyword_id_for_goods_id_sql_str = kw_insert_str_1
        self.req_num_retries = 7

    def _init_debugging_api(self):
        """Per-site crawl switches for keyword hot-goods collection.

        :return: dict mapping site type -> enabled flag
        """
        return {
            1: True,    # taobao
            2: False,   # 1688
            3: True,    # tmall
            4: False,   # jd
        }

    async def _fck_run(self):
        # Optional one-shot: store pc tb main-sort keywords, e.g.:
        # target_keywords_list = self.get_pc_tb_sort_keywords_list()
        # assert target_keywords_list != []
        # self.add_new_keywords_list_2_db(target_list=target_keywords_list)
        pass

    def _just_run(self):
        """Main loop: for every keyword in db, crawl every enabled site."""
        while True:
            result = None
            result_2 = None
            # Fetch keywords and all goods_id already present in the goods db.
            try:
                result = list(self.sql_cli._select_table(sql_str=kw_select_str_1))
                self.lg.info('正在获取db中已存在的goods_id...')
                result_2 = list(self.sql_cli._select_table(sql_str=kw_select_str_2))
                self.lg.info('db中已存在的goods_id获取成功!')
            except TypeError:
                self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            if result is None or result_2 is None:
                sleep(15)
                continue

            self.lg.info('db 已存在的goods_id_num: {}'.format(len(result_2)))
            # Counter used both for logging and for periodic db reconnects.
            self.add_goods_index = 0
            self.db_existed_goods_id_list = [item[0] for item in result_2]
            # Release the raw rows as early as possible.
            try:
                del result_2
            except:
                pass
            collect()

            for item in result:
                keyword_id = item[0]
                keyword = item[1]
                # Finish all enabled sites for one keyword before the next.
                self.lg.info('正在处理id为{0}, 关键字为 {1} ...'.format(keyword_id, keyword))
                # Filter: only keyword_id >= 43, plus the whitelist (25, 26).
                if int(keyword_id) < 43 and int(keyword_id) not in (25, 26):
                    self.lg.info('不在处理的keyword_id范围内, keyword_id: {}, keyword: {}'.format(
                        keyword_id, keyword))
                    continue

                for site_type, type_value in self.debugging_api.items():
                    # Iterate over the e-commerce sites to crawl.
                    if type_value is False:
                        self.lg.info('api为False, 跳过!')
                        continue
                    self.sql_cli = _block_get_new_db_conn(
                        db_obj=self.sql_cli,
                        index=self.add_goods_index,
                        logger=self.lg,
                        remainder=20,)
                    goods_id_list = self._get_keywords_goods_id_list(
                        type=site_type,
                        keyword=item)
                    self.lg.info('关键字为{0}, 获取到的goods_id_list_num: {1}'.format(
                        keyword, len(goods_id_list)))
                    # Deal with the collected goods_id_list.
                    self._deal_with_goods_id_list(
                        type=site_type,
                        goods_id_list=goods_id_list,
                        keyword_id=keyword_id)
                    sleep(3)

    @catch_exceptions_with_class_logger(default_res=[])
    def get_pc_tb_sort_keywords_list(self) -> list:
        """Fetch the main-sort keywords from the taobao pc home page.

        Kept intentionally small to limit later incremental update volume.

        :return: list of keyword strings (``[]`` on any error, via decorator)
        """
        headers = get_random_headers(
            connection_status_keep_alive=False,
            cache_control='',
        )
        body = Requests.get_url_body(
            url='https://www.taobao.com/',
            headers=headers,
            ip_pool_type=self.ip_pool_type,
            num_retries=self.req_num_retries,
            proxy_type=PROXY_TYPE_HTTPS,)
        assert body != ''

        # Only the main-sort keywords are collected.
        main_sort_key_list_sel = {
            'method': 'css',
            'selector': 'ul.service-bd li a ::text',
        }
        main_sort_list_key = parse_field(
            parser=main_sort_key_list_sel,
            target_obj=body,
            is_first=False,
            logger=self.lg,
        )
        # Categories deliberately excluded.
        not_need_main_sort_key_tuple = (
            '卡券',
            '本地服务',
            'DIY',
            '二手车',
            '生鲜',
            '鲜花',
        )
        return [item for item in main_sort_list_key
                if item not in not_need_main_sort_key_tuple]

    @catch_exceptions_with_class_logger(default_res=[])
    def get_db_keywords_list(self) -> list:
        """Read all existing keywords from the db.

        :return: list of keyword strings
        """
        self.lg.info('正在读取db中原先的keyword...')
        db_res = self.sql_cli._select_table(sql_str=kw_select_str_4)
        res = [i[0] for i in db_res]
        self.lg.info('db keywords 读取完毕!')
        return res

    def add_new_keywords_list_2_db(self, target_list: list):
        """Insert keywords into the db, skipping ones already stored.

        :param target_list: keywords to insert
        """
        db_keywords = self.get_db_keywords_list()
        for keyword in target_list:
            if keyword in db_keywords:
                self.lg.info('该关键字{0}已经存在于db中...'.format(keyword))
                continue
            self.lg.info('------>>>| 正在存储关键字 {0}'.format(keyword))
            self.sql_cli._insert_into_table_2(
                sql_str=kw_insert_str_2,
                params=(
                    str(keyword),
                    0,
                ),
                logger=self.lg,)
        self.lg.info('全部写入完毕!')

    def _get_keywords_goods_id_list(self, type, keyword):
        """Collect a goods_id list for one keyword on one site.

        :param type: site type (1 taobao | 2 1688 | 3 tmall | 4 jd)
        :param keyword: (id, keyword) tuple
        :return: list of goods_id strings
        """
        if type == 1:
            self.lg.info('下面是淘宝的关键字采集...')
            # Two sort orders; results are assumed not to overlap.
            goods_id_list_0 = self._get_taobao_goods_keywords_goods_id_list(
                keyword=keyword,
                sort_order=0,)
            goods_id_list_1 = self._get_taobao_goods_keywords_goods_id_list(
                keyword=keyword,
                sort_order=1,)
            goods_id_list = goods_id_list_0 + goods_id_list_1
        elif type == 2:
            self.lg.info('下面是阿里1688的关键字采集...')
            goods_id_list = self._get_1688_goods_keywords_goods_id_list(keyword=keyword)
        elif type == 3:
            self.lg.info('下面是天猫的关键字采集...')
            # Two sort orders; results are assumed not to overlap.
            goods_id_list_0 = self._get_tmall_goods_keywords_goods_id_list(
                keyword=keyword,
                sort_order=0)
            goods_id_list_1 = self._get_tmall_goods_keywords_goods_id_list(
                keyword=keyword,
                sort_order=1)
            goods_id_list = goods_id_list_0 + goods_id_list_1
        elif type == 4:
            self.lg.info('下面是京东的关键字采集...')
            goods_id_list = self._get_jd_goods_keywords_goods_id_list(keyword=keyword)
        else:
            goods_id_list = []

        return goods_id_list

    def _deal_with_goods_id_list(self, **kwargs):
        """Dispatch a goods_id_list to the site-specific spider.

        :param kwargs: type, goods_id_list, keyword_id
        :return: None
        """
        type = kwargs.get('type', '')
        goods_id_list = kwargs.get('goods_id_list', [])
        keyword_id = kwargs.get('keyword_id', '')

        if type == 1:
            self._taobao_keywords_spider(goods_id_list=goods_id_list, keyword_id=keyword_id)
        elif type == 2:
            self._1688_keywords_spider(goods_id_list=goods_id_list, keyword_id=keyword_id)
        elif type == 3:
            self._tmall_keywords_spider(goods_id_list=goods_id_list, keyword_id=keyword_id)
        elif type == 4:
            self._jd_keywords_spider(goods_id_list=goods_id_list, keyword_id=keyword_id)
        else:
            pass

        return None

    @catch_exceptions_with_class_logger(default_res=[])
    def _get_taobao_goods_keywords_goods_id_list(self, keyword, sort_order=0) -> list:
        """Get the goods_id list for a keyword via the ai.taobao pc search.

        NOTE(review): earlier approaches (s.taobao.com search — blocked by
        slider captcha / login-only m-site search / quanyoubuy.com third-party
        search) were removed as dead code; ai.taobao.com is the working path.

        :param keyword: (id, keyword) tuple
        :param sort_order: 0 sales-desc | 1 price-asc
        :return: list of goods_id strings (``[]`` on any error, via decorator)
        """
        # ai.taobao pc search page (https://ai.taobao.com/)
        headers = get_random_headers(
            connection_status_keep_alive=False,
            cache_control='',
        )
        headers.update({
            'authority': 'ai.taobao.com',
            'referer': 'https://ai.taobao.com/search/index.htm?key={}&sort=biz30day',
        })
        if sort_order == 0:
            # Sales-descending order.
            self.lg.info('按销量排序')
            params = (
                ('key', keyword[1]),
                ('sort', 'biz30day'),
                ('taobao', 'true'),     # restrict to taobao-only goods
            )
        elif sort_order == 1:
            # Price-ascending order.
            self.lg.info('按升序排序')
            params = (
                ('key', keyword[1]),
                ('taobao', 'true'),     # restrict to taobao-only goods
                ('sort', 'discount_price_incr'),
            )
        else:
            # Was `raise NotImplemented` — not an exception class, raising it
            # is a TypeError in Python 3.
            raise NotImplementedError

        body = Requests.get_url_body(
            url='https://ai.taobao.com/search/index.htm',
            headers=headers,
            params=params,
            ip_pool_type=self.ip_pool_type,
            num_retries=self.req_num_retries,
            proxy_type=PROXY_TYPE_HTTPS,
        )
        assert body != ''

        page_res = re.compile('var _pageResult = (.*?);</script>').findall(body)[0]
        data = json_2_dict(
            json_str=page_res,
            default_res={},
            logger=self.lg,).get('result', {}).get('auction', [])

        new_res = []
        for item in data:
            item_id = str(item.get('itemId', ''))
            if item_id != '':
                try:
                    this_price = float(item.get('realPrice', '0'))
                    if this_price < 8.8:
                        # Goods cheaper than ~9 yuan are not collected.
                        self.lg.info('该goods_id: {}, this_price: {}, 售价小于9元, pass'.format(item_id, this_price))
                        continue
                except Exception:
                    self.lg.error('遇到错误:', exc_info=True)
                    continue
                new_res.append(item_id)

        # Keep only the first 15 per keyword to limit duplicates/update load.
        return new_res[:15]

    @catch_exceptions_with_class_logger(default_res=-1)
    def judge_qyh_is_tb_by_goods_id(self, goods_id):
        """Classify a quanyoubuy goods page by platform.

        :param goods_id: goods id string
        :return: 0 tb | 1 tm | -1 unknown (also -1 on error, via decorator)
        """
        headers = get_random_headers(
            connection_status_keep_alive=False,
            cache_control='',
        )
        headers.update({
            'authority': 'www.quanyoubuy.com',
        })
        url = 'https://www.quanyoubuy.com/item/index/iid/{}.html'.format(goods_id)
        body = Requests.get_url_body(
            url=url,
            headers=headers,
            ip_pool_type=self.ip_pool_type,
            proxy_type=PROXY_TYPE_HTTPS,
            num_retries=7,)
        assert body != ''

        btn_text_sel = {
            'method': 'css',
            'selector': 'div.product-info a.go_btn span ::text',
        }
        btn_text = parse_field(
            parser=btn_text_sel,
            target_obj=body,
            is_print_error=False,
            logger=self.lg,
        )
        assert btn_text != ''

        if '天猫' in btn_text:
            self.lg.info('goods_id: {}, tm good'.format(goods_id))
            return 1
        elif '淘宝' in btn_text:
            self.lg.info('goods_id: {}, tb good'.format(goods_id))
            return 0
        else:
            self.lg.info('goods_id: {}, 未知 good'.format(goods_id))
            return -1

    def _get_1688_goods_keywords_goods_id_list(self, keyword):
        """Get top-selling 1688 goods ids for a keyword.

        Scrapes the first page of the m.1688.com search sorted by bookings.

        :param keyword: (id, keyword) tuple
        :return: list, eg: ['11111', ...]
        """
        headers = {
            'authority': 'm.1688.com',
            'cache-control': 'max-age=0',
            'upgrade-insecure-requests': '1',
            'user-agent': get_random_pc_ua(),
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
        }
        params = (
            ('sortType', 'booked'),
            ('filtId', ''),
            ('keywords', keyword[1]),
            ('descendOrder', 'true'),
        )
        url = 'https://m.1688.com/offer_search/-6161.html'
        body = Requests.get_url_body(url=url, headers=headers, params=params,
                                     ip_pool_type=self.ip_pool_type)
        if body == '':
            return []

        try:
            goods_id_list = Selector(text=body).css('div.list_group-item::attr("data-offer-id")').extract()
        except Exception as e:
            self.lg.exception(e)
            self.lg.error('获取1688搜索goods_id_list为空list! 出错关键字{0}'.format(keyword[1]))
            goods_id_list = []

        return goods_id_list

    @catch_exceptions_with_class_logger(default_res=[])
    def _get_tmall_goods_keywords_goods_id_list(self, keyword, sort_order=0) -> list:
        """Get top tmall goods for a keyword via the tmall m-site search.

        :param keyword: (id, keyword) tuple
        :param sort_order: 0 sales-desc | 1 price-asc
        :return: list of goods *urls* (not goods_id),
            eg: ['//detail.tmall.com/item.htm?id=566978017832&skuId=3606684772412', ...]
        """
        # The m-site search is occasionally unstable but still usable.
        headers = {
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'user-agent': get_random_phone_ua(),
            'accept': '*/*',
            'authority': 'list.tmall.com',
        }
        if sort_order == 0:
            self.lg.info('按销量排序')
            # The referer header is mandatory.
            referer = 'https://list.tmall.com/search_product.htm?q={}&type=p&spm=a220m.6910245.a2227oh.d100&from=mallfp..m_1_suggest&sort=d'.format(
                quote_plus(keyword[1]))
            params = {
                'page_size': '20',
                'page_no': '1',
                'q': str(keyword[1]),
                'type': 'p',
                'spm': 'a220m.6910245.a2227oh.d100',
                'from': 'mallfp..m_1_suggest',
                'sort': 'd',
            }
        elif sort_order == 1:
            self.lg.info('按升序排序')
            # The referer header is mandatory.
            referer = 'https://list.tmall.com/search_product.htm?q={}&type=p&spm=a220m.8599659.a2227oh.d100&from=mallfp..m_1_searchbutton&searchType=default&sort=p'.format(
                quote_plus(keyword[1]))
            params = (
                ('page_size', '20'),
                ('page_no', '1'),
                ('q', keyword[1]),
                ('type', 'p'),
                ('spm', 'a220m.8599659.a2227oh.d100'),
                ('from', 'mallfp..m_1_searchbutton'),
                ('searchType', 'default'),
                ('sort', 'p'),
            )
        else:
            # Was `raise NotImplemented` — not an exception class, raising it
            # is a TypeError in Python 3.
            raise NotImplementedError

        headers.update({
            'referer': referer,
        })
        s_url = 'https://list.tmall.com/m/search_items.htm'
        body = Requests.get_url_body(
            url=s_url,
            headers=headers,
            params=params,
            ip_pool_type=self.ip_pool_type,
            proxy_type=PROXY_TYPE_HTTPS,
            num_retries=self.req_num_retries,)
        assert body != ''

        data = json_2_dict(
            json_str=body,
            default_res={},
            logger=self.lg)
        assert data != {}, '获取到的天猫搜索data为空dict! 出错关键字为{0}'.format(keyword[1])
        _ = data.get('item', [])
        assert _ != [], '获取天猫搜索goods_id_list为空list! 出错关键字{0}'.format(keyword[1])
        assert _ is not None, '获取天猫搜索goods_id_list为空list! 出错关键字{0}'.format(keyword[1])

        res = []
        for item in _:
            try:
                item_url = str(item.get('url', ''))
                assert item_url != ''
                item_id = str(item.get('item_id', ''))
                this_price = float(item.get('price', '0'))
                if this_price < 8.8:
                    # Goods cheaper than ~9 yuan are not collected.
                    self.lg.info('该goods_id: {}, this_price: {}, 售价小于9元, pass'.format(item_id, this_price))
                    continue
                post_fee = float(item.get('post_fee', '0'))
                if post_fee > 0:
                    # Goods that charge postage are not collected.
                    self.lg.info('该goods_id: {}不包邮, 邮费: {}, pass'.format(item_id, post_fee))
                    continue
            except Exception:
                self.lg.error('遇到错误[出错关键字:{}]:'.format(keyword[1]), exc_info=True)
                continue
            res.append(item_url)

        # Keep only the first 15 per keyword to limit duplicates/update load.
        return res[:15]

    def _get_jd_goods_keywords_goods_id_list(self, keyword):
        """Get top-selling jd goods ids for a keyword via the jd m-site search.

        :param keyword: (id, keyword) tuple
        :return: [] or ['xxxx', ....]
        """
        headers = {
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'user-agent': get_random_pc_ua(),
            'accept': '*/*',
            'authority': 'so.m.jd.com',
        }
        params = (
            ('keyword', keyword[1]),
            ('datatype', '1'),
            ('callback', 'jdSearchResultBkCbA'),
            ('page', '1'),
            ('pagesize', '10'),
            ('ext_attr', 'no'),
            ('brand_col', 'no'),
            ('price_col', 'no'),
            ('color_col', 'no'),
            ('size_col', 'no'),
            ('ext_attr_sort', 'no'),
            ('merge_sku', 'yes'),
            ('multi_suppliers', 'yes'),
            ('area_ids', '1,72,2819'),
            ('sort_type', 'sort_totalsales15_desc'),
            ('qp_disable', 'no'),
            ('fdesc', '\u5317\u4EAC'),
        )
        s_url = 'https://so.m.jd.com/ware/search._m2wq_list'
        body = Requests.get_url_body(url=s_url, headers=headers, params=params,
                                     ip_pool_type=self.ip_pool_type)
        if body == '':
            return []

        # Strip the jsonp callback wrapper.
        try:
            data = re.compile(r'jdSearchResultBkCbA\((.*)\)').findall(body)[0]
        except IndexError:
            self.lg.error('获取jd的关键字数据时, IndexError! 出错关键字为{0}'.format((keyword[1])))
            return []

        # The payload contains \xa0-style escapes; un-escaped sequences such
        # as \http would otherwise break json decoding.
        data = deal_with_JSONDecodeError_about_value_invalid_escape(json_str=data)
        data = json_2_dict(json_str=data, logger=self.lg)
        if data == {}:
            # Fixed copy-paste message: said 天猫 (tmall) in this jd method.
            self.lg.error('获取到的京东搜索data为空dict! 出错关键字为{0}'.format(keyword[1]))
            return []

        data = data.get('data', {}).get('searchm', {}).get('Paragraph', [])
        # Skip jd pingou (group-buy) goods: a non-empty 'bp' in item['pinGou']
        # marks a pingou item.
        if data is not None and data != []:
            return [item.get('wareid', '') for item in data
                    if item.get('pinGou', {}).get('bp', '') == '']
        else:
            self.lg.error('获取到的data为空list, 请检查!')
            return []

    def _taobao_keywords_spider(self, **kwargs):
        """Crawl and store taobao goods for one keyword.

        :param kwargs: goods_id_list, keyword_id
        :return: True when the keyword is fully processed
        """
        goods_id_list = kwargs.get('goods_id_list')
        keyword_id = kwargs.get('keyword_id')
        goods_url_list = ['https://item.taobao.com/item.htm?id=' + item for item in goods_id_list]

        self.lg.info('即将开始抓取该关键字的goods, 请耐心等待...')

        for item in goods_url_list:     # item is a goods_url
            # Tracks whether this goods ended up inserted/known in db.
            result = False
            try:
                goods_id = re.compile(r'id=(\d+)').findall(item)[0]
            except IndexError:
                self.lg.error('re获取goods_id时出错, 请检查!')
                continue

            if goods_id in self.db_existed_goods_id_list:
                self.lg.info('该goods_id[{0}]已存在于db中!'.format(goods_id))
                result = True       # already stored counts as success
            else:
                taobao = TaoBaoLoginAndParse(logger=self.lg, is_real_times_update_call=True)
                self.sql_cli = _block_get_new_db_conn(
                    db_obj=self.sql_cli,
                    index=self.add_goods_index,
                    logger=self.lg,
                    remainder=20,)
                if self.sql_cli.is_connect_success:
                    goods_id = taobao.get_goods_id_from_url(item)
                    if goods_id == '':
                        self.lg.error('@@@ 原商品的地址为: {0}'.format(item))
                        continue
                    else:
                        self.lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (
                            goods_id, str(self.add_goods_index)))
                        tt = taobao.get_goods_data(goods_id)
                        data = taobao.deal_with_data(goods_id=goods_id)
                        if data != {}:
                            data['goods_id'] = goods_id
                            data['goods_url'] = 'https://item.taobao.com/item.htm?id=' + str(goods_id)
                            data['username'] = '******'
                            data['main_goods_id'] = None

                            # NOTE(review): aborts the whole keyword on the
                            # first illegal item (results are sales-ordered,
                            # so the rest would likely fail too) — confirm
                            # this early-exit is intended.
                            if not self.check_target_data_is_legal(target_data=data):
                                return False

                            result = taobao.old_taobao_goods_insert_into_new_table(
                                data,
                                pipeline=self.sql_cli)
                else:
                    self.lg.info('数据库连接失败,数据库可能关闭或者维护中')

            self.add_goods_index += 1
            collect()
            sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
            if result:      # goods was inserted or already existed in db
                self._insert_into_goods_id_and_keyword_middle_table(
                    goods_id=goods_id,
                    keyword_id=keyword_id)

        self.lg.info('该关键字的商品已经抓取完毕!')

        return True

    def _1688_keywords_spider(self, **kwargs):
        """Crawl and store 1688 goods for one keyword.

        :param kwargs: goods_id_list, keyword_id
        :return: True when the keyword is fully processed
        """
        goods_id_list = kwargs.get('goods_id_list')
        keyword_id = kwargs.get('keyword_id')
        goods_url_list = ['https://detail.1688.com/offer/{0}.html'.format(item) for item in goods_id_list]

        self.lg.info('即将开始抓取该关键字的goods, 请耐心等待...')

        for item in goods_url_list:
            result = False      # reset per goods
            try:
                goods_id = re.compile('offer/(.*?).html').findall(item)[0]
            except IndexError:
                self.lg.error('re获取goods_id时出错, 请检查!')
                continue

            if goods_id in self.db_existed_goods_id_list:
                self.lg.info('该goods_id[{0}]已存在于db中!'.format(goods_id))
                result = True       # already stored counts as success
            else:
                ali_1688 = ALi1688LoginAndParse(logger=self.lg)
                if self.add_goods_index % 20 == 0:
                    # Reconnect every 20 goods to avoid stale long-lived
                    # connections erroring out.
                    self.lg.info('正在重置,并与数据库建立新连接中...')
                    self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
                    self.lg.info('与数据库的新连接成功建立...')

                if self.sql_cli.is_connect_success:
                    goods_id = ali_1688.get_goods_id_from_url(item)
                    if goods_id == '':
                        self.lg.error('@@@ 原商品的地址为: {0}'.format(item))
                        continue
                    else:
                        self.lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (
                            goods_id, str(self.add_goods_index)))
                        tt = ali_1688.get_ali_1688_data(goods_id)
                        if tt.get('is_delete') == 1 and tt.get('before') is False:
                            # Off-shelf goods are skipped entirely.
                            continue

                        data = ali_1688.deal_with_data()
                        if data != {}:
                            data['goods_id'] = goods_id
                            data['goods_url'] = 'https://detail.1688.com/offer/' + goods_id + '.html'
                            data['username'] = '******'
                            data['main_goods_id'] = None

                            result = ali_1688.old_ali_1688_goods_insert_into_new_table(
                                data=data,
                                pipeline=self.sql_cli)
                else:
                    self.lg.info('数据库连接失败,数据库可能关闭或者维护中')

            self.add_goods_index += 1
            try:
                del ali_1688
            except:
                pass
            collect()
            sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
            if result:      # goods was inserted or already existed in db
                self._insert_into_goods_id_and_keyword_middle_table(
                    goods_id=goods_id,
                    keyword_id=keyword_id)

        self.lg.info('该关键字的商品已经抓取完毕!')

        return True

    def _tmall_keywords_spider(self, **kwargs):
        """Crawl and store tmall goods for one keyword.

        :param kwargs: goods_id_list (actually protocol-less goods urls),
            keyword_id
        :return: True when the keyword is fully processed
        """
        goods_id_list = kwargs.get('goods_id_list')
        keyword_id = kwargs.get('keyword_id')
        # Entries are '//detail.tmall.com/...&skuId=...' urls; strip the sku.
        goods_url_list = ['https:' + re.compile('&skuId=.*').sub('', item) for item in goods_id_list]

        self.lg.info('即将开始抓取该关键字的goods, 请耐心等待...')

        for item in goods_url_list:     # item is a goods_url
            # Tracks whether this goods ended up inserted/known in db.
            result = False
            try:
                goods_id = re.compile(r'id=(\d+)').findall(item)[0]
            except IndexError:
                self.lg.error('re获取goods_id时出错, 请检查!')
                continue

            if goods_id in self.db_existed_goods_id_list:
                self.lg.info('该goods_id[{0}]已存在于db中!'.format(goods_id))
                result = True       # already stored counts as success
            else:
                tmall = TmallParse(logger=self.lg, is_real_times_update_call=True)
                self.sql_cli = _block_get_new_db_conn(
                    db_obj=self.sql_cli,
                    index=self.add_goods_index,
                    logger=self.lg,
                    remainder=20,
                )
                if self.sql_cli.is_connect_success:
                    # Returns [tmall_type, goods_id] on success, [] on failure.
                    goods_id = tmall.get_goods_id_from_url(item)
                    if goods_id == []:
                        self.lg.error('@@@ 原商品的地址为: {0}'.format(item))
                        continue
                    else:
                        self.lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (
                            goods_id[1], str(self.add_goods_index)))
                        tt = tmall.get_goods_data(goods_id)
                        data = tmall.deal_with_data()
                        goods_id = goods_id[1]
                        if data != {}:
                            data['goods_id'] = goods_id
                            data['username'] = '******'
                            data['main_goods_id'] = None
                            data['goods_url'] = tmall._from_tmall_type_get_tmall_url(
                                type=data['type'],
                                goods_id=goods_id)
                            if data['goods_url'] == '':
                                self.lg.error('该goods_url为空值! 此处跳过!')
                                continue

                            # NOTE(review): aborts the whole keyword on the
                            # first illegal item — confirm this early-exit is
                            # intended (see _taobao_keywords_spider).
                            if not self.check_target_data_is_legal(target_data=data):
                                return False

                            result = tmall.old_tmall_goods_insert_into_new_table(
                                data,
                                pipeline=self.sql_cli)
                else:
                    self.lg.info('数据库连接失败,数据库可能关闭或者维护中')

            self.add_goods_index += 1
            collect()
            sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
            if result:      # goods was inserted or already existed in db
                self._insert_into_goods_id_and_keyword_middle_table(
                    goods_id=goods_id,
                    keyword_id=keyword_id)

        self.lg.info('该关键字的商品已经抓取完毕!')

        return True

    def _jd_keywords_spider(self, **kwargs):
        """Crawl and store jd goods for one keyword.

        The plain 'https://item.jd.com/xxxxx.html' url works as the initial
        address because jd redirects to the canonical url, so jd goods are
        stored un-classified as regular goods (site_id = 7).

        :param kwargs: goods_id_list, keyword_id
        :return: True when the keyword is fully processed
        """
        goods_id_list = kwargs.get('goods_id_list')
        keyword_id = kwargs.get('keyword_id')
        goods_url_list = ['https://item.jd.com/{0}.html'.format(str(item)) for item in goods_id_list]

        self.lg.info('即将开始抓取该关键字的goods, 请耐心等待...')

        for item in goods_url_list:     # item is a goods_url
            result = False      # tracks whether this goods ended up in db
            try:
                goods_id = re.compile(r'/(\d+)\.html').findall(item)[0]
            except IndexError:
                self.lg.error('re获取goods_id时出错, 请检查!')
                continue

            if goods_id in self.db_existed_goods_id_list:
                self.lg.info('该goods_id[{0}]已存在于db中!'.format(goods_id))
                result = True       # already stored counts as success
            else:
                jd = JdParse(logger=self.lg)
                if self.add_goods_index % 20 == 0:
                    # Reconnect every 20 goods to avoid stale long-lived
                    # connections erroring out.
                    self.lg.info('正在重置,并与数据库建立新连接中...')
                    self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
                    self.lg.info('与数据库的新连接成功建立...')

                if self.sql_cli.is_connect_success:
                    # Returns [jd_type, goods_id] on success, [] on failure.
                    goods_id = jd.get_goods_id_from_url(item)
                    if goods_id == []:
                        self.lg.error('@@@ 原商品的地址为: {0}'.format(item))
                        continue
                    else:
                        self.lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (
                            goods_id[1], str(self.add_goods_index)))
                        tt = jd.get_goods_data(goods_id)
                        data = jd.deal_with_data(goods_id)
                        goods_id = goods_id[1]
                        if data != {}:
                            data['goods_id'] = goods_id
                            data['username'] = '******'
                            data['main_goods_id'] = None
                            data['goods_url'] = item

                            result = jd.old_jd_goods_insert_into_new_table(data, self.sql_cli)
                else:
                    self.lg.info('数据库连接失败,数据库可能关闭或者维护中')

            self.add_goods_index += 1
            sleep(1)
            try:
                del jd
            except:
                pass
            collect()
            if result:      # goods was inserted or already existed in db
                self._insert_into_goods_id_and_keyword_middle_table(
                    goods_id=goods_id,
                    keyword_id=keyword_id)

        self.lg.info('该关键字的商品已经抓取完毕!')

        return True

    @catch_exceptions_with_class_logger(default_res=False)
    def check_target_data_is_legal(self, target_data: dict) -> bool:
        """Check whether crawled goods data passes the collection rules.

        :param target_data: parsed goods dict with 'sell_count' and
            'taobao_price'
        :return: True when legal (also False on error, via decorator)
        """
        if int(target_data['sell_count']) < 50:
            self.lg.info('该商品销量小于50, pass')
            return False
        if float(target_data['taobao_price']) < 8.8:
            self.lg.info('最低价小于9元不采集, pass')
            return False
        return True

    def _insert_into_goods_id_and_keyword_middle_table(self, **kwargs):
        """Link a goods_id to a keyword_id in the middle table.

        :param kwargs: goods_id, keyword_id
        :return: True when the link exists was inserted, else False
        """
        goods_id = str(kwargs['goods_id'])
        keyword_id = int(kwargs['keyword_id'])
        result = False

        # First check whether the middle table already links this keyword_id.
        try:
            _ = self.sql_cli._select_table(sql_str=kw_select_str_3, params=(goods_id,))
            _ = [i[0] for i in _]
        except Exception:
            self.lg.error('执行中间表goods_id_and_keyword_middle_table是否已新增该关键字的id的sql语句时出错, 跳过给商品加keyword_id')
            return result

        if keyword_id not in _:
            params = (
                goods_id,
                keyword_id,)
            self.lg.info('------>>>| 正在插入keyword_id为{0}, goods_id为{1}'.format(params[1], params[0]))
            result = self.sql_cli._insert_into_table_2(
                sql_str=self.add_keyword_id_for_goods_id_sql_str,
                params=params,
                logger=self.lg)

        return result

    def _add_keyword_2_db_from_excel_file(self):
        """Import new keywords into the db from a local excel file.

        :return: True on success, False when the excel file cannot be read
        """
        excel_file_path = '/Users/afa/Desktop/2018-07-18-淘宝phone-top20万.xlsx'
        self.lg.info('正在读取{0}, 请耐心等待...'.format(excel_file_path))
        try:
            excel_result = read_info_from_excel_file(excel_file_path=excel_file_path)
        except Exception:
            self.lg.error('遇到错误:', exc_info=True)
            return False
        self.lg.info('读取完毕!!')

        self.lg.info('正在读取db中原先的keyword...')
        db_keywords = self.sql_cli._select_table(sql_str=kw_select_str_4)
        db_keywords = [i[0] for i in db_keywords]
        self.lg.info('db keywords 读取完毕!')

        for item in excel_result:
            keyword = item.get('关键词', None)
            if not keyword:
                continue
            if keyword in db_keywords:
                self.lg.info('该关键字{0}已经存在于db中...'.format(keyword))
                continue
            self.lg.info('------>>>| 正在存储关键字 {0}'.format(keyword))
            self.sql_cli._insert_into_table_2(
                sql_str=kw_insert_str_2,
                params=(str(keyword), 0),
                logger=self.lg)

        self.lg.info('全部写入完毕!')

        return True

    def __del__(self):
        try:
            del self.lg
            del self.msg
            del self.sql_cli
        except:
            pass
        try:
            del self.db_existed_goods_id_list
        except:
            pass
        collect()
class GoodsSortByShopTypeSpider2(AsyncCrawler):
    """
    Tmall-supermarket (tmcs) category / goods-relation spider.

    Workflow:
      1. ``get_tm_sort_info_and_2_db`` harvests level-3-category -> goods_id
         relations into common_shop_sort_and_goods_relation_table;
      2. ``deal_with_tmcs_goods_sort_relation_2_goods_table`` then crawls every
         goods_id from that relation table not yet present in
         dbo.GoodsInfoAutoGet and stores it.
    """

    def __init__(self):
        AsyncCrawler.__init__(
            self,
            user_agent_type=PHONE,
            ip_pool_type=IP_POOL_TYPE,
            log_print=True,
            logger=None,
            log_save_path=MY_SPIDER_LOGS_PATH + '/goods_sort_by_shop_type/_/',
            headless=True,
        )
        self.req_num_retries = 6
        # The sign/token values below must be captured from a freshly opened
        # m-site supermarket page (url: https://chaoshi.m.tmall.com/).
        # Before every run: copy the new cookies of the corresponding request
        # straight from Chrome, and update `t` (the `t` request param) inside
        # cp_utils.block_calculate_tb_right_sign plus `_m_h5_tk` to the values
        # of the latest request.
        # eg, for the tmcs entry page:
        #   t = '1590387891307'
        #   _m_h5_tk = '6f594c22870353cede88c2796cc28ee9'
        self.tm_new_chrome_t = '1590545557798'
        self.tm_new_chrome_sign = 'c2d1fced4d7b1333d0f19b6b637fed9f'
        self.tm_new_chrome_cookies_str = 'hng=CN%7Czh-CN%7CCNY%7C156; cna=wRsVFTj6JEoCAXHXtCqXOzC7; lid=%E6%88%91%E6%98%AF%E5%B7%A5%E5%8F%B79527%E6%9C%AC%E4%BA%BA; enc=MXX6theE39REQu4vFae7f5vi8A8GAdt5pdcQAJY7eR3zuOxwTSUu0zQGRWpBLbzxbJUsLvdHk4vB8ZWvQR%2BjQg%3D%3D; l=eB_zn817vA2VK0x_BOfZnurza779_IRAguPzaNbMiOCPOdfH5H0fWZAGqqTMCnGVh6uk83JDb3ZQBeYBcBdKnxvOnrZgURDmn; sm4=330100; csa=0_0_0_0_0_0_0_0_0_0_0_0_0; sgcookie=EbIdqdSy36jBPHKaO%2FPZS; uc3=id2=UUplY9Ft9xwldQ%3D%3D&lg2=W5iHLLyFOGW7aA%3D%3D&vt3=F8dBxGZjLZslLqBqC3E%3D&nk2=rUtEoY7x%2Bk8Rxyx1ZtN%2FAg%3D%3D; t=c413bd0891628c3269938122b2bee15f; tracknick=%5Cu6211%5Cu662F%5Cu5DE5%5Cu53F79527%5Cu672C%5Cu4EBA; uc4=id4=0%40U2gvLJ3%2BK6kqeorNX%2B21sXN8x3lW&nk4=0%40r7rCNeQ4%2Bj7fAj%2BMcdPH4%2B0X9x%2FwQLp0Sd4%2F; lgc=%5Cu6211%5Cu662F%5Cu5DE5%5Cu53F79527%5Cu672C%5Cu4EBA; _tb_token_=ee5587773876d; cookie2=13ed682f1aa10261d267e8e5a9e8e223; _m_h5_tk=883da77eaee1f1b25a7fb1f4c95b68e6_1590554541015; _m_h5_tk_enc=d531110c50a3daed05299dbb0b6dc3f0; isg=BBISyyg5mGC0AuZBht_2zq0HY970Ixa9xZzn1dxrPkWw77LpxLNmzRgJWw32n45V'
        # level-1 tmcs categories: display name, icon slug used by the m-api,
        # and the api-side level-1 id
        self.tm_first_sort_list = [
            {'name': '休闲零食', 'icon_type': 'categoryxiuxianlingshi', 'level1_id': 78},
            {'name': '粮油米面', 'icon_type': 'categoryliangyoumimian', 'level1_id': 80},
            {'name': '乳饮酒水', 'icon_type': 'categorynaipinshuiyin', 'level1_id': 79},
            {'name': '日用百货', 'icon_type': 'categorychufangriyong', 'level1_id': 81},
            {'name': '母婴用品', 'icon_type': 'categorymuyingyongping', 'level1_id': 82},
            {'name': '个人护理', 'icon_type': 'categorygerenhuli', 'level1_id': 83},
            {'name': '纸品家清', 'icon_type': 'categoryjiaqingjiaju', 'level1_id': 84},
            {'name': '美容护肤', 'icon_type': 'categorymeironghufu', 'level1_id': 94},
            {'name': '方便速食', 'icon_type': 'categoryfangbiansushi', 'level1_id': 92},
            {'name': '中外名酒', 'icon_type': 'categoryzhongwaimingjiu', 'level1_id': 87},
            {'name': '童装童鞋', 'icon_type': 'categorytongzhuang', 'level1_id': 138},
            {'name': '成人用品', 'icon_type': 'categorychengrenyongpin', 'level1_id': 93},
            {'name': '家纺内衣', 'icon_type': 'categoryjiafangneiyi', 'level1_id': 90},
            {'name': '宠物生活', 'icon_type': 'categorychongwuyongpin', 'level1_id': 91},
            {'name': '电器数码', 'icon_type': 'category3cqipei', 'level1_id': 95},
            {'name': '进口好货', 'icon_type': 'categoryjinkouhaohuo', 'level1_id': 85},
            {'name': '医疗保健', 'icon_type': 'categoryzibubaojian', 'level1_id': 89},
        ]
        # category names / substrings marking promotional, non-generic virtual
        # categories — skipped while harvesting the category tree
        self.tm_skip_name_tuple = (
            '好货', '为你推荐', '热销榜单', '每日特惠', '一件包邮', '新品尝鲜',
            '年中大赏', '喵九八', '新品推荐', '特惠', '尝新', '精致好货',
            '超值爆款', '包邮', '优选', '直播', '尖叫单品', '品牌专区',
            '大牌', '网红爆款', '新品', '清凉一夏', '热销', '推荐',
            '国家馆', '优惠', '折',
            '送',  # eg: 买一送一
            '精选', '爆款', '上新', '秒杀', '热门', '减', '满减',
        )
        self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        self._init_sql_str()
        # goods ids already stored in dbo.GoodsInfoAutoGet
        self.db_existed_goods_id_list = []
        # refresh the db connection every `sql_cli_remainder` processed items
        self.sql_cli_remainder = 20

    def _init_sql_str(self):
        """Pre-build all sql statements used by this spider."""
        self.sql_str0 = 'insert into dbo.common_shop_sort_level_table(unique_id, sort_level1_id, sort_level1_name, sort_level2_id, sort_level2_name, sort_level3_id, sort_level3_name, shop_id) values(%s, %s, %s, %s, %s, %s, %s, %s)'
        self.sql_str1 = """
        select unique_id, sort_level1_id, sort_level2_id, sort_level2_name, sort_level3_id, sort_level3_name
        from dbo.common_shop_sort_level_table
        where sort_level2_id != '' and sort_level3_id != ''
        """
        self.sql_str2 = 'insert into dbo.common_shop_sort_and_goods_relation_table(create_time, unique_id, sort_unique_id, goods_id, goods_url) values(%s, %s, %s, %s, %s)'
        self.sql_str3 = """
        select unique_id
        from dbo.common_shop_sort_and_goods_relation_table
        """
        self.sql_str4 = 'select GoodsID from dbo.GoodsInfoAutoGet'
        self.sql_str5 = """
        select goods_id
        from dbo.common_shop_sort_and_goods_relation_table
        where sort_unique_id in (
            select unique_id
            from dbo.common_shop_sort_level_table
            where shop_id='tmcs'
        )
        """

    async def _fck_run(self):
        """Entry point: currently only syncs relation-table goods into the goods table."""
        # await self.get_tm_sort_info_and_2_db()
        # 处理待存入的tmcs goods_id
        await self.deal_with_tmcs_goods_sort_relation_2_goods_table()

    async def deal_with_tmcs_goods_sort_relation_2_goods_table(self):
        """
        Endless loop: diff common_shop_sort_and_goods_relation_table against
        dbo.GoodsInfoAutoGet and crawl/store every missing tmcs goods_id.

        :return: never returns normally (15s back-off on db errors)
        """
        while True:
            # running index used to decide when to refresh the db connection
            self.add_goods_index = 0
            try:
                result0 = list(self.sql_cli._select_table(sql_str=self.sql_str4))
                assert result0 is not None
                result1 = list(self.sql_cli._select_table(sql_str=self.sql_str5))
                assert result1 is not None
                self.lg.info('db 已存在的goods_id_num: {}'.format(len(result0)))
                self.db_existed_goods_id_list = [item[0] for item in result0]
                assert self.db_existed_goods_id_list != []
                # tmcs goods ids awaiting storage
                self.db_wait_2_save_goods_id_list = [item[0] for item in result1]
                assert self.db_wait_2_save_goods_id_list != []
            except Exception:
                self.lg.error('遇到错误:', exc_info=True)
                await async_sleep(15)
                continue
            try:
                del result0
                del result1
            except:
                pass
            collect()
            await self.deal_with_tmcs_goods_id_list()

    async def deal_with_tmcs_goods_id_list(self):
        """
        Crawl every goods_id in ``self.db_wait_2_save_goods_id_list`` that is
        not yet in db and insert it into the legacy goods table.

        :return: True when the whole list has been processed
        """
        self.lg.info('即将开始抓取tmcs goods, 请耐心等待...')
        for item in self.db_wait_2_save_goods_id_list:
            # eg: '61864164616'
            goods_id = item
            if goods_id in self.db_existed_goods_id_list:
                self.lg.info('该goods_id[{0}]已存在于db中!'.format(goods_id))
                continue
            tmall = TmallParse(logger=self.lg, is_real_times_update_call=True)
            self.sql_cli = _block_get_new_db_conn(
                db_obj=self.sql_cli,
                index=self.add_goods_index,
                logger=self.lg,
                remainder=self.sql_cli_remainder,
            )
            if self.sql_cli.is_connect_success:
                # plain clean url; a spm param would only matter for
                # get_goods_id_from_url's filtering
                goods_url = 'https://detail.tmall.com/item.htm?id={}'.format(goods_id)
                # rebinds goods_id to a [tmall_type, goods_id] list ([] on failure)
                goods_id = tmall.get_goods_id_from_url(goods_url)
                if goods_id == []:
                    self.lg.error('@@@ 原商品的地址为: {0}'.format(goods_url))
                    continue
                self.lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (goods_id[1], str(self.add_goods_index)))
                # fetch populates the parser's internal state; the raw return
                # value itself is unused
                tmall.get_goods_data(goods_id)
                data = tmall.deal_with_data()
                goods_id = goods_id[1]
                if data != {}:
                    data['goods_id'] = goods_id
                    data['username'] = '******'
                    data['main_goods_id'] = None
                    data['goods_url'] = tmall._from_tmall_type_get_tmall_url(
                        type=data['type'],
                        goods_id=goods_id,
                    )
                    if data['goods_url'] == '':
                        self.lg.error('该goods_url为空值! 此处跳过!')
                        continue
                    if len(data['all_img_url']) <= 1:
                        self.lg.info('[goods_id: {}]主图个数<=1, pass'.format(goods_id))
                        # BUG FIX: was `return False`, which aborted the whole
                        # remaining goods list on a single item with too few
                        # main images — skip just this item instead
                        continue
                    result = tmall.old_tmall_goods_insert_into_new_table(
                        data=data, pipeline=self.sql_cli)
                    if result:
                        # remember it so the same id is not re-collected later
                        self.db_existed_goods_id_list.append(goods_id)
            else:
                self.lg.info('数据库连接失败,数据库可能关闭或者维护中')
            self.add_goods_index += 1
            collect()
            sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
        self.lg.info('tmcs已经抓取完毕!')
        return True

    def get_common_shop_sort_level_table_unique_id(self,
                                                   shop_id,
                                                   sort_level1_id='',
                                                   sort_level2_id='',
                                                   sort_level3_id='') -> str:
        """
        Build the deterministic unique_id of a common_shop_sort_level_table row
        (uuid3 over a 'shop::lvl1::lvl2::lvl3' style string) to avoid duplicates.

        :param shop_id: eg 'tmcs'
        :param sort_level1_id: optional, already 'tmcs'-prefixed
        :param sort_level2_id: optional
        :param sort_level3_id: optional
        :return: uuid3 string
        """
        target_str = shop_id
        if sort_level1_id != '':
            target_str = target_str + '::' + str(sort_level1_id)
        if sort_level2_id != '':
            target_str = target_str + '::' + str(sort_level2_id)
        if sort_level3_id != '':
            target_str = target_str + '::' + str(sort_level3_id)
        self.lg.info(target_str)
        return get_uuid3(target_str=target_str)

    async def get_tm_sort_info_and_2_db(self):
        """
        Harvest tmcs category/goods info into db.

        The level-1/2/3 category-tree harvesting that used to live here was
        long-dead commented-out code and has been removed (see VCS history);
        the building blocks (_tmcs_insert_into_sort_level{1,2,3}_2_db and
        get_tm_{second,third}_sort_info_*) remain available.
        """
        try:
            ori_db_data = self.sql_cli._select_table(
                sql_str=self.sql_str1,
                logger=self.lg,
            )
            assert ori_db_data is not None
            # unique ids already present in common_shop_sort_and_goods_relation_table
            self.db_goods_sort_relation_unique_id_list = self.sql_cli._select_table(
                sql_str=self.sql_str3,
                logger=self.lg,
            )
            assert self.db_goods_sort_relation_unique_id_list is not None
            self.db_goods_sort_relation_unique_id_list = [
                i[0] for i in self.db_goods_sort_relation_unique_id_list
            ]
            assert self.db_goods_sort_relation_unique_id_list != []
            # fetch and store the goods of every level-3 category
            await self._tmcs_insert_into_goods_info_2_db(ori_db_data=ori_db_data)
        except Exception:
            self.lg.error('遇到错误:', exc_info=True)

    async def _tmcs_insert_into_goods_info_2_db(self, ori_db_data: (tuple, list)):
        """
        Insert level-3-category -> goods relations into
        common_shop_sort_and_goods_relation_table.

        :param ori_db_data: rows of (unique_id, lvl1_id, lvl2_id, lvl2_name,
                            lvl3_id, lvl3_name) with 'tmcs'-prefixed ids
        :return: None
        """
        for item in ori_db_data:
            is_success = False
            try:
                # strip the 'tmcs' prefix back off to get the numeric api ids
                sort_unique_id, sort_level1_id = item[0], int(item[1].replace('tmcs', ''))
                sort_level2_id, sort_level2_name = int(item[2].replace('tmcs', '')), item[3]
                sort_level3_id, sort_level3_name = int(item[4].replace('tmcs', '')), item[5]
                assert sort_level2_id != ''
                assert sort_level3_id != ''
                icon_type = self.get_tmcs_icon_type_by_sort_level1_id(
                    sort_level1_id=sort_level1_id,
                )
                self.lg.info(
                    'Get sort_level1_id: {}, sort_level2_name: {}, sort_level2_id: {}, sort_level3_name: {}, sort_level3_id: {} ing ...'
                    .format(
                        sort_level1_id,
                        sort_level2_name,
                        sort_level2_id,
                        sort_level3_name,
                        sort_level3_id,
                    ))
                target_data = await self.get_tm_fourth_sort_info_by_second_id_and_third_id(
                    second_id=sort_level2_id,
                    third_id=sort_level3_id,
                    icon_type=icon_type,
                    business='B2C',
                )
                assert target_data != {}
                is_success = True
                now_time = get_shanghai_time()
                for i in target_data.get('goods_list', []):
                    try:
                        goods_id = i.get('goods_id', '')
                        assert goods_id != ''
                        goods_relation_unique_id = self.get_tmcs_goods_relation_unique_id(
                            sort_unique_id=sort_unique_id,
                            goods_id=goods_id,
                        )
                        if goods_relation_unique_id in self.db_goods_sort_relation_unique_id_list:
                            self.lg.info('db 已存在goods_relation_unique_id: {}, 跳过'.format(goods_relation_unique_id))
                            continue
                        res = self.sql_cli._insert_into_table_2(
                            sql_str=self.sql_str2,
                            params=(
                                now_time,
                                goods_relation_unique_id,
                                sort_unique_id,
                                goods_id,
                                '',
                            ),
                            logger=self.lg,
                        )
                        if res:
                            self.db_goods_sort_relation_unique_id_list.append(
                                goods_relation_unique_id)
                    except Exception:
                        continue
            except Exception:
                self.lg.error('遇到错误:', exc_info=True)
            # NOTE(review): if the very first unpack above raised,
            # sort_level3_name is unbound here on the first item (pre-existing
            # behavior, kept) — confirm whether this path can occur.
            self.lg.info(
                '[{}] sort_level3_name: {}, sort_level3_id: {}'.format(
                    '+' if is_success else '-',
                    sort_level3_name,
                    sort_level3_id,
                ))
        return

    def get_tmcs_goods_relation_unique_id(self, sort_unique_id: str, goods_id: str):
        """
        Deterministic unique id of a (category, goods) relation row.

        :param sort_unique_id: unique_id of the category row
        :param goods_id:
        :return: uuid3 string
        """
        return get_uuid3(target_str=sort_unique_id + goods_id)

    def get_tmcs_icon_type_by_sort_level1_id(self, sort_level1_id: int) -> str:
        """
        Look up the api icon slug of a level-1 category id.

        :param sort_level1_id: raw numeric level-1 id
        :return: icon_type slug
        :raises ValueError: when the id is not in tm_first_sort_list
        """
        for item in self.tm_first_sort_list:
            if item.get('level1_id', -1) == sort_level1_id:
                return item.get('icon_type', '')
        raise ValueError('获取sort_level1_id: {}, 对应的icon_type异常'.format(sort_level1_id))

    async def _tmcs_insert_into_sort_level3_2_db(self,
                                                 sort_level1_id,
                                                 sort_level1_name,
                                                 sort_level2_name,
                                                 data: dict):
        """
        Store level-3 tmcs category rows.

        :param sort_level1_id: raw numeric level-1 id (gets 'tmcs'-prefixed here)
        :param sort_level1_name:
        :param sort_level2_name:
        :param data: result of get_tm_third_sort_info_by_second_id
        :return: None
        """
        try:
            sort_level2_id = data.get('second_id', -1)
            assert sort_level2_id != -1
            # convert to globally unique, 'tmcs'-prefixed ids
            sort_level1_id = self._get_unique_tmcs_sort_level_id(sort_level_id=sort_level1_id)
            sort_level2_id = self._get_unique_tmcs_sort_level_id(sort_level_id=sort_level2_id)
            assert sort_level1_name != ''
            assert sort_level2_name != ''
            for item in data.get('third_list', []):
                try:
                    sort_level3_id = item.get('id', -1)
                    assert sort_level3_id != -1
                    sort_level3_id = self._get_unique_tmcs_sort_level_id(sort_level_id=sort_level3_id)
                    sort_level3_name = item.get('name', '')
                    assert sort_level3_name != ''
                    self.lg.info('sort_level3_id: {}, sort_level3_name: {}'.format(
                        sort_level3_id,
                        sort_level3_name,
                    ))
                    unique_id = self.get_common_shop_sort_level_table_unique_id(
                        shop_id='tmcs',
                        sort_level1_id=sort_level1_id,
                        sort_level2_id=sort_level2_id,
                        sort_level3_id=sort_level3_id,
                    )
                    self.sql_cli._insert_into_table_2(
                        sql_str=self.sql_str0,
                        params=(
                            unique_id,
                            sort_level1_id,
                            sort_level1_name,
                            sort_level2_id,
                            sort_level2_name,
                            sort_level3_id,
                            sort_level3_name,
                            'tmcs',
                        ),
                        logger=self.lg,
                    )
                except Exception:
                    continue
        except Exception:
            self.lg.error('遇到错误:', exc_info=True)
        return

    async def _tmcs_insert_into_sort_level2_2_db(self, data: dict):
        """
        Store level-2 tmcs category rows.

        :param data: result of get_tm_second_sort_info_by_first_sort_name
        :return: None
        """
        try:
            sort_level1_id = data.get('level1_id', -1)
            assert sort_level1_id != -1
            # convert to the globally unique, 'tmcs'-prefixed id
            sort_level1_id = self._get_unique_tmcs_sort_level_id(sort_level_id=sort_level1_id)
            sort_level1_name = data.get('first_sort_name', '')
            assert sort_level1_name != ''
            for item in data.get('second_list', []):
                try:
                    sort_level2_id = item.get('id', -1)
                    assert sort_level2_id != -1
                    sort_level2_id = self._get_unique_tmcs_sort_level_id(sort_level_id=sort_level2_id)
                    sort_level2_name = item.get('name', '')
                    assert sort_level2_name != ''
                    self.lg.info('sort_level2_id: {}, sort_level2_name: {}'.format(
                        sort_level2_id,
                        sort_level2_name,
                    ))
                    unique_id = self.get_common_shop_sort_level_table_unique_id(
                        shop_id='tmcs',
                        sort_level1_id=sort_level1_id,
                        sort_level2_id=sort_level2_id,
                    )
                    self.sql_cli._insert_into_table_2(
                        sql_str=self.sql_str0,
                        params=(
                            unique_id,
                            sort_level1_id,
                            sort_level1_name,
                            sort_level2_id,
                            sort_level2_name,
                            '',
                            '',
                            'tmcs',
                        ),
                        logger=self.lg,
                    )
                except Exception:
                    continue
        except Exception:
            self.lg.error('遇到错误:', exc_info=True)
        return

    async def _tmcs_insert_into_sort_level1_2_db(self):
        """
        Store the static level-1 tmcs category rows from tm_first_sort_list.

        :return: None
        """
        for item in self.tm_first_sort_list:
            sort_level1_id = item.get('level1_id', '')
            # convert to the globally unique, 'tmcs'-prefixed id
            sort_level1_id = self._get_unique_tmcs_sort_level_id(sort_level_id=sort_level1_id)
            sort_level1_name = item.get('name', '')
            self.lg.info('sort_level1_id: {}, sort_level1_name: {}'.format(
                sort_level1_id,
                sort_level1_name,
            ))
            unique_id = self.get_common_shop_sort_level_table_unique_id(
                shop_id='tmcs',
                sort_level1_id=sort_level1_id,
            )
            try:
                self.sql_cli._insert_into_table_2(
                    sql_str=self.sql_str0,
                    params=(
                        unique_id,
                        sort_level1_id,
                        sort_level1_name,
                        '',
                        '',
                        '',
                        '',
                        'tmcs',
                    ),
                    logger=self.lg,
                )
            except Exception:
                self.lg.error('遇到错误:', exc_info=True)
                continue
        return

    def _get_unique_tmcs_sort_level_id(self, sort_level_id) -> str:
        """
        Prefix a raw category id with 'tmcs' so ids are unique across shops.

        :param sort_level_id: raw numeric id
        :return: eg 'tmcs78'
        """
        return 'tmcs' + str(sort_level_id)

    async def _fetch_tmcs_category_data(self, api: str, callback: str, data: str) -> dict:
        """
        Shared signed request against a tmcs mtop category endpoint.

        Extracted from the three near-identical get_tm_*_sort_info_* methods.
        Cookies must be fresh Chrome cookies (no sensitive cookie needed),
        and t/sign must match the captured request — see __init__ notes.

        :param api: mtop api name, eg 'mtop.chaoshi.aselfshoppingguide.category.level1'
        :param callback: jsonp callback name used by the captured request
        :param data: json-encoded request payload
        :return: the ['data']['data'] dict of the jsonp body
        :raises AssertionError: on empty body or empty payload
        """
        headers = {
            'authority': 'h5api.m.tmall.com',
            'user-agent': get_random_phone_ua(),
            'accept': '*/*',
            'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'cookie': self.tm_new_chrome_cookies_str,
        }
        params = (
            ('jsv', '2.5.1'),
            ('appKey', '12574478'),
            ('t', self.tm_new_chrome_t),         # eg: '1590379653365'
            ('sign', self.tm_new_chrome_sign),   # eg: 'f0e252789605777cb36b6d99ce41ee7c'
            ('api', api),
            ('v', '1.0'),
            ('type', 'jsonp'),
            ('dataType', 'jsonp'),
            ('callback', callback),
            ('data', data),
        )
        base_url = 'https://h5api.m.tmall.com/h5/{}/1.0/'.format(api)
        # a single (first) request is enough to obtain the data
        result0 = await get_taobao_sign_and_body(
            base_url=base_url,
            headers=headers,
            params=tuple_or_list_params_2_dict_params(params=params),
            data=data,
            ip_pool_type=self.ip_pool_type,
            proxy_type=PROXY_TYPE_HTTPS,
            logger=self.lg,
        )
        body = result0[2]
        assert body != ''
        # strip the jsonp wrapper and dig out the nested payload
        parsed = json_2_dict(
            json_str=re.compile(r'\((.*)\)').findall(body)[0],
            default_res={},
            logger=self.lg,
        ).get('data', {}).get('data', {})
        assert parsed != {}
        # drop the banner payload — never used downstream
        parsed['banner'] = []
        return parsed

    async def get_tm_second_sort_info_by_first_sort_name(
            self, first_sort_name, icon_type: str, level1_id: int):
        """
        Fetch the level-2 categories of a level-1 category.

        (pc category pages need login cookies that expire quickly — the
        m-site mtop api is used instead.)

        :param first_sort_name:
        :param icon_type:
        :param level1_id:
        :return: dict with 'first_sort_name', 'icon_type', 'level1_id', 'second_list'
        """
        data = dumps({
            "smAreaId": 330100,
            "csaInfo": "0_0_0_0_0_0_0_0_0_0_0_0_0",
            "csa": "0_0_0_0_0_0_0_0_0_0_0_0_0",
            "iconType": icon_type,
            "level1Id": str(level1_id),
        })
        api_data = await self._fetch_tmcs_category_data(
            api='mtop.chaoshi.aselfshoppingguide.category.level1',
            callback='mtopjsonp2',
            data=data,
        )
        second_list = []
        for item in api_data.get('secondList', []):
            try:
                _id = item.get('id', '')
                assert _id != ''
                name = item.get('text', '')
                assert name != ''
                # filter promotional, non-generic categories
                assert name not in self.tm_skip_name_tuple
                for i in self.tm_skip_name_tuple:
                    if i in name:
                        # handle substrings, eg: '特惠', '尝新'
                        raise ValueError('出现跳过字眼, pass')
                business = item.get('business', '')
                assert business != ''
            except Exception:
                continue
            second_list.append({
                'id': _id,
                'name': name,
                'business': business,
            })
        res = {
            'first_sort_name': first_sort_name,
            'icon_type': icon_type,
            'level1_id': level1_id,
            'second_list': second_list,
        }
        pprint(res)
        return res

    async def get_tm_third_sort_info_by_second_id(
            self, second_id: int, icon_type: str, business: str = 'B2C',):
        """
        Fetch the level-3 categories of a level-2 category.

        :param second_id:
        :param icon_type:
        :param business: eg 'B2C'
        :return: dict with 'second_id', 'third_list'
        """
        data = dumps({
            "smAreaId": 330100,
            "csaInfo": "0_0_0_0_0_0_0_0_0_0_0_0_0",
            "csa": "0_0_0_0_0_0_0_0_0_0_0_0_0",
            "iconType": icon_type,
            "level2Id": str(second_id),
            "business": business,
        })
        api_data = await self._fetch_tmcs_category_data(
            api='mtop.chaoshi.aselfshoppingguide.category.level2',
            callback='mtopjsonp7',
            data=data,
        )
        third_list = []
        # 'thrirdList' [sic] is the api's own misspelled key
        for item in api_data.get('thrirdList', []):
            try:
                _id = item.get('id', '')
                assert _id != ''
                name = item.get('text', '')
                assert name != ''
                # filter promotional, non-generic categories
                assert name not in self.tm_skip_name_tuple
                for i in self.tm_skip_name_tuple:
                    if i in name:
                        # handle substrings, eg: '特惠', '尝新'
                        raise ValueError('出现跳过字眼, pass')
            except Exception:
                continue
            third_list.append({
                'id': _id,
                'name': name,
            })
        res = {
            'second_id': second_id,
            'third_list': third_list,
        }
        pprint(res)
        return res

    async def get_tm_fourth_sort_info_by_second_id_and_third_id(
            self, second_id: int, third_id: int, icon_type: str, business: str = 'B2C',):
        """
        Fetch the goods list of a (level-2, level-3) category pair.

        :param second_id:
        :param third_id:
        :param icon_type:
        :param business: eg 'B2C'
        :return: dict with 'second_id', 'third_id', 'goods_list'
        """
        data = dumps({
            "smAreaId": 330100,
            "csaInfo": "0_0_0_0_0_0_0_0_0_0_0_0_0",
            "csa": "0_0_0_0_0_0_0_0_0_0_0_0_0",
            "iconType": icon_type,
            "level2Id": str(second_id),
            'level3Id': str(third_id),
            'index': 50,
            'pageSize': 20,
            "business": business,
        })
        api_data = await self._fetch_tmcs_category_data(
            api='mtop.chaoshi.aselfshoppingguide.category.level3',
            callback='mtopjsonp7',
            data=data,
        )
        goods_list = []
        for item in api_data.get('itemList', {}).get('itemAndContentList', []):
            try:
                goods_id = item.get('itemId', '')
                assert goods_id != ''
                title = item.get('shortTitle', '')
                assert title != ''
            except Exception:
                continue
            goods_list.append({
                'goods_id': str(goods_id),
                'title': title,
            })
        res = {
            'second_id': second_id,
            'third_id': third_id,
            'goods_list': goods_list,
        }
        pprint(res)
        return res

    def __del__(self):
        # best-effort teardown; attributes may already be gone at interpreter exit
        try:
            del self.lg
            del self.sql_cli
            del self.db_existed_goods_id_list
        except:
            pass
        collect()
class ZWMSpider(AsyncCrawler):
    """Spider that periodically syncs data from the yrmpay agent console
    ("ZWM"): merchant settlement records and merchant/store management
    records are crawled page by page and inserted into (or updated in)
    the SQL Server tables.
    """
    def __init__(self):
        AsyncCrawler.__init__(
            self,
            ip_pool_type=IP_POOL_TYPE,
            log_print=True,
            log_save_path=MY_SPIDER_LOGS_PATH + '/zwm/_/',
        )
        self.init_zwm_pwd()
        self.concurrency = 20
        self.num_retries = 6
        # last page to crawl for transaction details
        self.max_transaction_details_page_num = 20
        # last page to crawl for merchant settlement records
        self.max_business_settlement_records_page_num = 20
        # last page for merchant & store management records
        # (must be raised if the data ever exceeds this page count)
        self.max_business_manage_page_num = 80
        self.login_cookies_dict = {}
        self.sleep_time = 5

    def init_zwm_pwd(self):
        """Load the login credentials (username / pwd) from the local
        json secret file at ZWM_PWD_PATH."""
        ori_data = ''
        with open(ZWM_PWD_PATH, 'r') as f:
            for line in f:
                # strip newlines and blanks so the whole file parses as one json string
                ori_data += line.replace('\n', '').replace(' ', '')
        data = json_2_dict(
            json_str=ori_data,
            logger=self.lg,
            default_res={},)
        self.zwm_username, self.zwm_pwd = data['username'], data['pwd']
        assert self.zwm_username != '' and self.zwm_pwd != ''

    async def _fck_run(self) -> None:
        """Main driver loop: login, sync all settlement records, sync all
        merchant/store management records, sleep self.sleep_time minutes,
        repeat forever. Any exception aborts only the current round."""
        while True:
            try:
                login_res = await self._login()
                assert login_res is True, '登录失败, 退出后续同步操作!'

                # transaction details have their own api elsewhere — no longer needed here
                # all_transaction_details = await self._get_all_transaction_details()
                # pprint(all_transaction_details)
                # self.lg.info('len_all_transaction_details: {}'.format(len(all_transaction_details)))
                # await self._wash_and_save_all_transaction_details(target_list=all_transaction_details)

                # fetch all merchant settlement records
                self.lg.info('获取所有商户结算记录...')
                all_business_settlement_records = await self._get_all_business_settlement_records_by_something()
                # pprint(all_business_settlement_records)
                self.lg.info('len_now_business_settlement_records: {}'.format(len(all_business_settlement_records)))
                await self._wash_save_all_business_settlement_records(target_list=all_business_settlement_records)
                self.lg.info('\n')

                # fetch all merchant & store management records
                self.lg.info('获取所有商户及门店管理记录 ...')
                all_business_manage_records = await self._get_all_business_manage_records_by_something()
                # pprint(all_business_manage_records)
                self.lg.info('len_all_business_manage_records: {}'.format(len(all_business_manage_records)))
                await self._wash_save_all_business_manage_records(target_list=all_business_manage_records)
                self.lg.info('\n')
            except Exception:
                self.lg.error('遇到错误:', exc_info=True)

            self.lg.info('## 同步完成 ##')
            self.lg.info('休眠 {} minutes ...'.format(self.sleep_time))
            # timed re-sync
            await async_sleep(60 * self.sleep_time)

    async def _login(self) -> bool:
        """Log into the agent console with a multipart form post.

        On success the session cookies are kept in self.login_cookies_dict
        for all subsequent page requests.
        :return: True on success, False on any failure.
        """
        headers = await self._get_random_pc_headers()
        headers.update({
            'Referer': 'https://agent.yrmpay.com/JHAdminConsole/loginNew.jsp',
        })
        file_load = {
            'loginName': self.zwm_username,
            'userPassword': self.zwm_pwd,
        }
        m = MultipartEncoder(fields=file_load)
        # self.lg.info(m)
        # the multipart encoder generates the boundary, so it must supply the Content-Type
        headers.update({
            'Content-Type': m.content_type
        })
        login_url = 'https://agent.yrmpay.com/JHAdminConsole/foreigncard/permissionsLogin.do'
        with session() as _session:
            try:
                response = _session.post(
                    url=login_url,
                    headers=headers,
                    data=m,
                    proxies=self._get_proxies(),)
                login_res = json_2_dict(
                    json_str=response.text,
                    default_res={},
                    logger=self.lg,
                ).get('message', '')
                assert login_res == '登录成功', '登录失败!'
                self.lg.info(login_res)
                self.login_cookies_dict = response.cookies.get_dict()
                assert self.login_cookies_dict != {}, 'self.login_cookies_dict != 空dict!'
                # pprint(self.login_cookies_dict)
            except Exception:
                self.lg.error('遇到错误:', exc_info=True)
                return False
        return True

    async def _wash_save_all_business_manage_records(self, target_list: list):
        """Wash all business manage records, then store the unseen ones
        and update the already-stored ones.
        :param target_list: raw record dicts from the management list api
        :return: None
        """
        all_res = []
        for item in target_list:
            try:
                now_time = get_shanghai_time()
                create_time, modify_time, approval_status_change_time = now_time, now_time, now_time
                agent_name = item['agentName']
                top_agent_name = item['topAgentName']
                shop_type = item['merType']
                # map the Chinese yes/no flag to 1/0
                is_high_quality_shop = item['isHighQualityMer']
                if is_high_quality_shop == '否':
                    is_high_quality_shop = 0
                elif is_high_quality_shop == '是':
                    is_high_quality_shop = 1
                else:
                    raise ValueError('is_high_quality_shop value: {} 异常!'.format(is_high_quality_shop))
                shop_id = item.get('jhmid', '')
                assert shop_id != ''
                shop_chat_name = item.get('merchantName', '')
                assert shop_chat_name != ''
                phone_num = item.get('phone', '')
                assert phone_num != ''
                shop_chant_num = int(item['merchantNum'])
                sale = item['sale']
                is_real_time = 0 if item['isRealTime'] == '未开通' else 1
                approve_date = date_parse(item['approveDate'])
                rate = Decimal(item['rate']).__round__(4)
                account_type = item['accType']
                apply_time = date_parse(item['applyTime'])
                # may be empty
                process_context = item.get('processContext', '')
                is_non_contact = 0 if item['isNonContact'] == '未开通' else 1
                # map the Chinese approval status to 0/1/2
                approval_status = item['approvalStatus']
                if approval_status == '待审核':
                    approval_status = 1
                elif approval_status == '审核通过':
                    approval_status = 0
                elif approval_status == '退回':
                    approval_status = 2
                else:
                    raise ValueError('approval_status value: {} 异常'.format(approval_status))
                # the upstream id is constant and unique — used as-is
                unique_id = item['id']
            except Exception:
                self.lg.error('遇到错误:', exc_info=True)
                continue

            zwm_item = ZWMBusinessManageRecordItem()
            zwm_item['unique_id'] = unique_id
            zwm_item['create_time'] = create_time
            zwm_item['modify_time'] = modify_time
            zwm_item['agent_name'] = agent_name
            zwm_item['top_agent_name'] = top_agent_name
            zwm_item['shop_type'] = shop_type
            zwm_item['is_high_quality_shop'] = is_high_quality_shop
            zwm_item['shop_id'] = shop_id
            zwm_item['shop_chat_name'] = shop_chat_name
            zwm_item['phone_num'] = phone_num
            zwm_item['shop_chant_num'] = shop_chant_num
            zwm_item['sale'] = sale
            zwm_item['is_real_time'] = is_real_time
            zwm_item['approve_date'] = approve_date
            zwm_item['rate'] = rate
            zwm_item['account_type'] = account_type
            zwm_item['apply_time'] = apply_time
            zwm_item['process_context'] = process_context
            zwm_item['is_non_contact'] = is_non_contact
            zwm_item['approval_status'] = approval_status
            zwm_item['approval_status_change_time'] = approval_status_change_time
            all_res.append(dict(zwm_item))
            # debug hooks
            # if shop_id == 'YRMPAY100038574':
            # if phone_num == '18192242001':
            # if shop_chat_name == '哇哇叫':
            #     pprint(dict(zwm_item))

        # pprint(all_res)
        await self._insert_or_update_shop_manage_records_table(all_res=all_res)
        try:
            del all_res
        except:
            pass

        return None

    async def _insert_or_update_shop_manage_records_table(self, all_res: list):
        """Insert new rows or update existing rows of the shop manage
        records table, keyed by unique_id.
        :param all_res: washed record dicts
        :return: None
        """
        self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            db_data = self.sql_cli._select_table(
                sql_str=zwm_select_str_2,
                params=None,
                logger=self.lg,
            )
            # pprint(db_data)
            db_unique_id_list = [item[0] for item in db_data]
            assert db_unique_id_list != [], 'db_unique_id_list != []'
            self.lg.info('len_db_unique_id_list: {}'.format(len(db_unique_id_list)))
        except Exception:
            self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
            self.lg.error('遇到错误:', exc_info=True)
            return None

        new_add_count = 0
        for item in all_res:
            unique_id = item['unique_id']
            if unique_id not in db_unique_id_list:
                # insert
                self.lg.info('inserting unique_id: {} ...'.format(unique_id))
                params = await self._get_insert_item_params2(item=item)
                try:
                    res = self.sql_cli._insert_into_table_2(
                        sql_str=zwm_insert_str_2,
                        params=params,
                        logger=self.lg)
                    if res:
                        new_add_count += 1
                except Exception:
                    self.lg.error('遇到错误:', exc_info=True)
                    continue
            else:
                # keep the original change-time unless the approval status actually changed
                db_old_approval_status, db_old_approval_status_change_time = await self._get_dd_old_approval_status_and_approval_status_change_time(
                    db_data=db_data,
                    unique_id=unique_id,)
                item['approval_status_change_time'] = await self._get_new_approval_status_change_time(
                    db_old_approval_status=db_old_approval_status,
                    db_old_approval_status_change_time=db_old_approval_status_change_time,
                    new_approval_status=item['approval_status'],
                    new_approval_status_change_time=item['approval_status_change_time'])
                # update
                self.lg.info('updating unique_id: {} ...'.format(unique_id))
                params = await self._get_update_item_params(item=item)
                try:
                    res = self.sql_cli._update_table_2(
                        sql_str=zwm_update_str_1,
                        params=params,
                        logger=self.lg)
                except Exception:
                    self.lg.error('遇到错误:', exc_info=True)
                    continue

            # re-create the pipeline if the db connection was lost
            if not self.sql_cli.is_connect_success:
                self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
            else:
                pass

        try:
            del db_data
            del db_unique_id_list
        except:
            pass
        self.lg.info('table.zwm_buss_manage_records新增个数: {}'.format(new_add_count))

    async def _get_new_approval_status_change_time(self,
                                                   db_old_approval_status,
                                                   db_old_approval_status_change_time,
                                                   new_approval_status,
                                                   new_approval_status_change_time):
        """Return the approval_status_change_time to persist: keep the db
        value while the status is unchanged, bump to now when it changed,
        or keep the caller's value when the db held no time at all.
        :return: datetime-like value
        """
        if db_old_approval_status_change_time is not None:
            new_approval_status_change_time = db_old_approval_status_change_time \
                if db_old_approval_status == new_approval_status \
                else get_shanghai_time()
        else:
            pass

        return new_approval_status_change_time

    async def _get_dd_old_approval_status_and_approval_status_change_time(self, db_data: list, unique_id: str) -> tuple:
        """Look up the stored (approval_status, approval_status_change_time)
        row for unique_id.
        :param db_data: rows of (unique_id, approval_status, change_time)
        :param unique_id:
        :return: tuple — NOTE(review): implicitly None when unique_id is absent
        """
        for item in db_data:
            if unique_id == item[0]:
                return item[1], item[2]
            else:
                continue

    async def _get_all_business_manage_records_by_something(self,):
        """Fetch all merchant & store management records concurrently,
        one task per page.
        :return: list of raw record dicts
        """
        async def get_tasks_params_list(max_business_manage_page_num) -> list:
            """Build the per-page task params."""
            tasks_params_list = []
            for page_num in range(1, max_business_manage_page_num):
                tasks_params_list.append({
                    'page_num': page_num,
                })

            return tasks_params_list

        def get_create_task_msg(k) -> str:
            return 'create task[where page_num: {}]...'.format(k['page_num'])

        def get_now_args(k) -> list:
            return [
                k['page_num'],
            ]

        res = await get_or_handle_target_data_by_task_params_list(
            loop=self.loop,
            tasks_params_list=await get_tasks_params_list(
                max_business_manage_page_num=self.max_business_manage_page_num),
            func_name_where_get_create_task_msg=get_create_task_msg,
            func_name=self._get_one_page_business_manage_records_by_something,
            func_name_where_get_now_args=get_now_args,
            func_name_where_handle_one_res=None,
            func_name_where_add_one_res_2_all_res=default_add_one_res_2_all_res,
            one_default_res=[],
            step=self.concurrency,
            logger=self.lg,
            get_all_res=True,)

        return res

    @catch_exceptions_with_class_logger(default_res=[])
    def _get_one_page_business_manage_records_by_something(self,
                                                           page_num: int,
                                                           start_date: str = None,
                                                           end_date: str = None,):
        """Fetch one page of merchant & store management records.
        :param page_num:
        :param start_date: defaults to the 27th of the previous month, eg: '2019-01-27 00:00'
        :param end_date: eg: '2019-07-20 09:39'
        :return: list of record dicts
        """
        # TODO crawl from the very beginning until now (i.e. everything), so approval-status /
        #  approval-time changes of old shops are not missed by the backend sync
        # start_date = str(self.get_1_on_the_month() if start_date is None else start_date).split(' ')[0] + ' 00:00'
        start_date = '2018-01-01 00:00'
        end_date = (str(get_shanghai_time()) if end_date is None else end_date)[0:16]
        self.lg.info('start_date: {}, end_date: {}'.format(start_date, end_date))
        headers = self.get_random_pc_headers()
        headers.update({
            'Accept': '*/*',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Referer': 'https://agent.yrmpay.com/JHAdminConsole/merchantMaterial/page.do',
            'X-Requested-With': 'XMLHttpRequest',
        })
        params = (
            ('_dc', get_now_13_bit_timestamp()),
        )
        data = {
            'merchantCode': '',
            'accType': '',
            'phone': '',
            'approveDate': '',
            'merchantName': '',
            'processStatus': '',
            'startTime': start_date,
            'endTime': end_date,
            'agentName': '',
            'page': str(page_num),
            'start': str((page_num - 1) * 100),  # start offset: 0, 100, 200
            'limit': '100',
        }
        url = 'https://agent.yrmpay.com/JHAdminConsole/merchantMaterial/materialList.do'

        body = Requests.get_url_body(
            method='post',
            url=url,
            headers=headers,
            params=params,
            cookies=self.login_cookies_dict,
            data=data,
            ip_pool_type=self.ip_pool_type,
            num_retries=self.num_retries,)
        assert body != '', 'body不为空值!'

        res = json_2_dict(
            json_str=body,
            logger=self.lg,
            default_res={}).get('materialList', [])
        self.lg.info('[{}] page_num: {}'.format(
            '+' if res != [] else '-',
            page_num,
        ))

        return res

    async def _wash_save_all_business_settlement_records(self, target_list):
        """Wash the merchant settlement records and store only the ones
        not yet present in the db (dedup by a generated uuid).
        :param target_list: raw record dicts from the settlement api
        :return: None
        """
        self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            db_data = self.sql_cli._select_table(
                sql_str=zwm_select_str_1,
                params=None,
                logger=self.lg,)
            # pprint(db_data)
            db_unique_id_list = [item[0] for item in db_data]
            assert db_unique_id_list != [], 'db_unique_id_list != []'
            self.lg.info('len_db_unique_id_list: {}'.format(len(db_unique_id_list)))
        except Exception:
            self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
            self.lg.error('遇到错误:', exc_info=True)
            return None

        all_res = []
        for item in target_list:
            # pprint(item)
            try:
                create_time = get_shanghai_time()
                shop_name = item.get('merName', '')
                assert shop_name != ''
                shop_id = item.get('mid', '')
                assert shop_id != ''
                agent_name = item['agentName']
                top_agent_name = item['topAgentName']
                date_settle_type = item['settleType']
                trans_amount = item.get('transAmt', '')
                assert trans_amount != ''
                trans_amount = Decimal(trans_amount).__round__(2)
                service_charge = Decimal(item['mda']).__round__(2)
                accounting_amount = Decimal(item['mnamt']).__round__(2)
                # normally eg: '20190704'; abnormal form eg: '20190824-20190824'
                txn_day = item['txnDay']
                if re.compile('-').findall(txn_day) != []:
                    txn_day = txn_day.split('-')[0]
                else:
                    pass
                trans_date = date_parse(txn_day)
                trans_status = item['status']
                if trans_status == '已结算':
                    trans_status = 0
                else:
                    raise ValueError('trans_status: {}, 未知交易状态!'.format(trans_status))
                settle_type = item['type']
                settle_date = date_parse(item['minDay'])
                # generate the unique identifier from the record's own fields
                unique_id = get_uuid3(
                    target_str=shop_id + str(date_settle_type) + str(trans_amount) + \
                               str(service_charge) + str(trans_date) + \
                               str(settle_type) + str(settle_date),)
            except Exception:
                self.lg.error('遇到错误:', exc_info=True)
                continue

            if unique_id in db_unique_id_list:
                # self.lg.info('该record[unique_id: {}]已存在!'.format(unique_id))
                continue

            settle_record_item = ZWMBusinessSettlementRecordItem()
            settle_record_item['unique_id'] = unique_id
            settle_record_item['create_time'] = create_time
            settle_record_item['shop_name'] = shop_name
            settle_record_item['shop_id'] = shop_id
            settle_record_item['agent_name'] = agent_name
            settle_record_item['top_agent_name'] = top_agent_name
            settle_record_item['date_settle_type'] = date_settle_type
            settle_record_item['trans_amount'] = trans_amount
            settle_record_item['service_charge'] = service_charge
            settle_record_item['accounting_amount'] = accounting_amount
            settle_record_item['trans_date'] = trans_date
            settle_record_item['trans_status'] = trans_status
            settle_record_item['settle_type'] = settle_type
            settle_record_item['settle_date'] = settle_date
            all_res.append(dict(settle_record_item))

        # pprint(all_res)
        self.lg.info('未存储个数: {}'.format(len(all_res)))
        await self._save_all_business_settlement_records(all_res=all_res)
        try:
            del all_res
        except:
            pass

        return None

    async def _save_all_business_settlement_records(self, all_res) -> None:
        """Insert the newly found merchant settlement records.
        :param all_res: washed record dicts not yet in the db
        :return: None
        """
        new_add_count = 0
        for item in all_res:
            # only unseen data reaches this point
            unique_id = item['unique_id']
            self.lg.info('saving unique_id: {} ...'.format(unique_id))
            params = await self._get_insert_item_params(item=item)
            try:
                res = self.sql_cli._insert_into_table_2(
                    sql_str=zwm_insert_str_1,
                    params=params,
                    logger=self.lg)
                if res:
                    new_add_count += 1
            except Exception:
                self.lg.error('遇到错误:', exc_info=True)
                continue

            # re-create the pipeline if the db connection was lost
            if not self.sql_cli.is_connect_success:
                self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
            else:
                pass

        self.lg.info('新增个数: {}'.format(new_add_count))

        return None

    async def _get_insert_item_params(self, item) -> tuple:
        """Build the insert params tuple for the settlement records table.
        :param item:
        :return: tuple in the column order of zwm_insert_str_1
        """
        return tuple([
            item['unique_id'],
            item['create_time'],
            item['shop_name'],
            item['shop_id'],
            item['agent_name'],
            item['top_agent_name'],
            item['date_settle_type'],
            item['trans_amount'],
            item['service_charge'],
            item['accounting_amount'],
            item['trans_date'],
            item['trans_status'],
            item['settle_type'],
            item['settle_date'],
        ])

    async def _get_insert_item_params2(self, item) -> tuple:
        """Build the insert params tuple for the zwm_buss_manage_records table.
        :param item:
        :return: tuple in the column order of zwm_insert_str_2
        """
        return tuple([
            item['unique_id'],
            item['create_time'],
            item['modify_time'],
            item['agent_name'],
            item['top_agent_name'],
            item['shop_type'],
            item['is_high_quality_shop'],
            item['shop_id'],
            item['shop_chat_name'],
            item['phone_num'],
            item['shop_chant_num'],
            item['sale'],
            item['is_real_time'],
            item['approve_date'],
            item['rate'],
            item['account_type'],
            item['apply_time'],
            item['process_context'],
            item['is_non_contact'],
            item['approval_status'],
            item['approval_status_change_time'],
        ])

    async def _get_update_item_params(self, item: dict) -> tuple:
        """Build the update params tuple for the zwm_buss_manage_records
        table (unique_id last, as the where-clause key).
        :param item:
        :return: tuple in the placeholder order of zwm_update_str_1
        """
        return tuple([
            item['modify_time'],
            item['agent_name'],
            item['top_agent_name'],
            item['shop_type'],
            item['is_high_quality_shop'],
            item['shop_id'],
            item['shop_chat_name'],
            item['phone_num'],
            item['shop_chant_num'],
            item['sale'],
            item['is_real_time'],
            item['approve_date'],
            item['rate'],
            item['account_type'],
            item['apply_time'],
            item['process_context'],
            item['is_non_contact'],
            item['approval_status'],
            item['approval_status_change_time'],
            item['unique_id'],
        ])

    async def _wash_and_save_all_transaction_details(self, target_list: list):
        """Wash and store all transaction details.
        :param target_list:
        :return: NOTE(review): intentionally unimplemented — the caller path is commented out
        """
        pass

    async def _get_all_business_settlement_records_by_something(self):
        """Fetch all merchant settlement records concurrently, one task
        per page.
        :return: list of raw record dicts
        """
        async def get_tasks_params_list(max_business_settlement_records_page_num) -> list:
            """Build the per-page task params."""
            tasks_params_list = []
            for page_num in range(1, max_business_settlement_records_page_num):
                tasks_params_list.append({
                    'page_num': page_num,
                })

            return tasks_params_list

        def get_create_task_msg(k) -> str:
            return 'create task[where page_num: {}]...'.format(k['page_num'])

        def get_now_args(k) -> list:
            return [
                k['page_num'],
            ]

        res = await get_or_handle_target_data_by_task_params_list(
            loop=self.loop,
            tasks_params_list=await get_tasks_params_list(
                max_business_settlement_records_page_num=self.max_business_settlement_records_page_num),
            func_name_where_get_create_task_msg=get_create_task_msg,
            func_name=self._get_one_page_business_settlement_records_by_something,
            func_name_where_get_now_args=get_now_args,
            func_name_where_add_one_res_2_all_res=default_add_one_res_2_all_res,
            one_default_res=[],
            step=self.concurrency,
            logger=self.lg,
            get_all_res=True,)

        return res

    @catch_exceptions_with_class_logger(default_res=[])
    def _get_one_page_business_settlement_records_by_something(self,
                                                               page_num: int,
                                                               start_date: str = None,
                                                               end_date: str = None,
                                                               mid: str = '',
                                                               agent_name: str = '') -> list:
        """Fetch one page of merchant settlement records.
        :param page_num:
        :param start_date: defaults to the 27th of the previous month, eg: '2019-07-01'
        :param end_date: eg: '2019-07-16'
        :param mid: merchant id
        :param agent_name: top-level agency name
        :return: list of record dicts
        """
        start_date = str(self.get_1_on_the_month() if start_date is None else start_date).split(' ')[0]
        # start_date = '2018-01-01'
        end_date = (str(get_shanghai_time()) if end_date is None else end_date).split(' ')[0]
        self.lg.info('start_date: {}, end_date: {}'.format(start_date, end_date))
        headers = self.get_random_pc_headers()
        headers.update({
            'Accept': '*/*',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Referer': 'https://agent.yrmpay.com/JHAdminConsole/merSettle/querySettleJsp.do',
            'X-Requested-With': 'XMLHttpRequest',
        })
        params = (
            ('_dc', get_now_13_bit_timestamp()),
        )
        data = {
            'startDate': start_date,
            'endDate': end_date,
            'mid': mid,
            'agentName': agent_name,
            'loginAgentId': self.zwm_username[0:8],  # first 8 chars of the username
            'page': str(page_num),
            'start': str((page_num - 1) * 100),      # start offset: 0, 100, 200
            'limit': '100',
        }
        url = 'https://agent.yrmpay.com/JHAdminConsole/merSettle/queryMerSettleList.do'

        body = Requests.get_url_body(
            method='post',
            url=url,
            headers=headers,
            params=params,
            cookies=self.login_cookies_dict,
            data=data,
            ip_pool_type=self.ip_pool_type,
            num_retries=self.num_retries,)
        # self.lg.info(body)
        assert body != '', 'body不为空值!'

        res = json_2_dict(
            json_str=body,
            logger=self.lg,
            default_res={}).get('data', [])
        self.lg.info('[{}] page_num: {}'.format(
            '+' if res != [] else '-',
            page_num,
        ))

        return res

    async def _get_all_transaction_details(self) -> list:
        """Fetch all transaction flow pages concurrently in slices of
        self.concurrency tasks.
        :return: flattened list of record dicts
        """
        async def _get_tasks_params_list() -> list:
            """Build the per-page task params."""
            tasks_params_list = []
            for page_num in range(1, self.max_transaction_details_page_num):
                tasks_params_list.append({
                    'page_num': page_num,
                })

            return tasks_params_list

        tasks_params_list = await _get_tasks_params_list()
        tasks_params_list_obj = TasksParamsListObj(
            tasks_params_list=tasks_params_list,
            step=self.concurrency,)
        all_res = []
        while True:
            try:
                # raises AssertionError when the slices are exhausted
                slice_params_list = tasks_params_list_obj.__next__()
            except AssertionError:
                break

            tasks = []
            for k in slice_params_list:
                page_num = k['page_num']
                self.lg.info('create task[where page_num: {}]...'.format(page_num))
                func_args = [
                    page_num,
                ]
                tasks.append(self.loop.create_task(
                    unblock_func(
                        func_name=self._get_one_page_transaction_details_by_something,
                        func_args=func_args,
                        logger=self.lg,)))

            one_res = await async_wait_tasks_finished(tasks=tasks)
            try:
                del tasks
            except:
                pass
            # flatten the per-page lists into all_res
            for i in one_res:
                for j in i:
                    all_res.append(j)

        return all_res

    @catch_exceptions_with_class_logger(default_res=[])
    def _get_one_page_transaction_details_by_something(self,
                                                       page_num: int,
                                                       start_date: str = None,
                                                       end_date: str = None,
                                                       transaction_status: str = '',
                                                       mer_name: str = '',
                                                       order_no: str = '',
                                                       mid: str = '',
                                                       agent_name: str = '',
                                                       pay_channel: str = '',
                                                       sale_name: str = '',) -> list:
        """Fetch one page of transaction flow.
        :param page_num: page to fetch, eg: 1, 2, 3
        :param start_date: eg: '2019-07-16 00:00'
        :param end_date: eg: '2019-07-16 10:02'
        :param transaction_status: '' = all, '1' = success, '3' = refund success
        :param mer_name: merchant name to query
        :param order_no: order number
        :param mid: merchant id
        :param agent_name: top-level agency name
        :param pay_channel: '' = any, '50' = wechat, '51' = alipay, '55' = wechat barcode, '56' = alipay barcode, '67' = wechat mini-program
        :param sale_name: sales name
        :return: list of record dicts
        """
        res = []
        start_date = self.get_0_00_on_the_day() if start_date is None else start_date
        end_date = str(get_shanghai_time()) if end_date is None else end_date

        headers = self.get_random_pc_headers()
        headers.update({
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Accept': '*/*',
            'Referer': 'https://agent.yrmpay.com/JHAdminConsole/limafuReport/transflow.do',
            'X-Requested-With': 'XMLHttpRequest',
        })
        params = (
            ('_dc', get_now_13_bit_timestamp()),
        )
        data = {
            'startDate': start_date,
            'endDate': end_date,
            'type': '2',
            'status': transaction_status,
            'payChannel': pay_channel,
            'orderNo': order_no,
            'merName': mer_name,
            'mid': mid,
            'agentName': agent_name,
            'saleName': sale_name,
            'page': str(page_num),
            'start': str((page_num - 1) * 20),  # start offset: 0, 20, 40
            'limit': '20',
        }
        url = 'https://agent.yrmpay.com/JHAdminConsole/limafuReport/querylimafuTransFlow.do'

        body = Requests.get_url_body(
            method='post',
            url=url,
            headers=headers,
            params=params,
            cookies=self.login_cookies_dict,
            data=data,
            ip_pool_type=self.ip_pool_type,
            num_retries=self.num_retries,)
        assert body != '', 'body不为空值!'

        res = json_2_dict(
            json_str=body,
            logger=self.lg,
            default_res={}).get('data', [])
        self.lg.info('[{}] page_num: {}'.format(
            '+' if res != [] else '-',
            page_num,
        ))

        return res

    def get_0_00_on_the_day(self) -> str:
        """Get 00:00 of today (Shanghai time).
        :return: str
        """
        now_time = get_shanghai_time()

        return str(datetime(
            year=now_time.year,
            month=now_time.month,
            day=now_time.day))

    def get_1_on_the_month(self) -> str:
        """Get a start-of-range date in the previous month.
        :return: str
        """
        now_time = get_shanghai_time()
        # day 5 instead of 1, to avoid missing flows around month end
        day = 5
        now_month = now_time.month
        if now_month > 1:
            now_month -= 1
        else:
            # now_month is January
            now_month = 12
            # NOTE(review): the year is NOT decremented here, so in January this
            # yields December of the *current* year (a future date) — verify intended.

        return str(datetime(
            year=now_time.year,
            month=now_month,
            day=day,))

    def _get_proxies(self) -> dict:
        """Get a non-empty proxies dict from the ip pool.
        :return: dict
        """
        proxies = Requests._get_proxies(ip_pool_type=self.ip_pool_type, )
        assert proxies != {}, 'proxies不为空dict!'

        return proxies

    async def _get_random_pc_headers(self) -> dict:
        """Async wrapper around get_random_pc_headers().
        :return: dict
        """
        return self.get_random_pc_headers()

    @staticmethod
    def get_random_pc_headers() -> dict:
        # base headers with a randomized user-agent, plus the site's fixed headers
        headers = get_random_headers(
            upgrade_insecure_requests=False,
            cache_control='',)
        headers.update({
            'Origin': 'https://agent.yrmpay.com',
            'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
            # 'Content-Type': 'multipart/form-data; boundary=----WebKitFormBoundarytSJCAoaErjNY4IbM',
            'accept': 'text/plain, */*; q=0.01',
            'X-Requested-With': 'XMLHttpRequest',
        })

        return headers

    def __del__(self):
        try:
            del self.lg
            del self.login_cookies_dict
        except:
            pass
        try:
            del self.loop
        except:
            pass
        collect()
class TaoBaoWeiTaoShareParse(AsyncCrawler): def __init__( self, logger=None, *params, **kwargs, ): AsyncCrawler.__init__( self, *params, **kwargs, logger=logger, ip_pool_type=IP_POOL_TYPE, log_print=True, log_save_path=MY_SPIDER_LOGS_PATH + '/淘宝/微淘/', ) self._set_headers() self.msg = '' self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline() def _set_headers(self): self.headers = { 'accept-encoding': 'gzip, deflate, br', 'accept-language': 'zh-CN,zh;q=0.9', 'user-agent': get_random_pc_ua(), 'accept': '*/*', 'referer': 'https://market.m.taobao.com/apps/market/content/index.html?ut_sk=1.VmYadv9DXkkDAFZm0VV4JBNq_21380790_1527298517854.Copy.33¶ms=%7B%22csid%22%3A%2254a52aea54b7c29d289a0e36b2bf2f51%22%7D&wh_weex=true&contentId=200668154273&source=weitao_2017_nocover&data_prefetch=true&suid=3D763077-A7BF-43BC-9092-C17B35E896F9&wx_navbar_transparent=false&wx_navbar_hidden=false&sourceType=other&un=bc80c9f324602d31384c4a342af87869&share_crt_v=1&sp_tk=o6R2Q0ZDMHZvaDBlS6Ok&cpp=1&shareurl=true&spm=a313p.22.68.948703884987&short_name=h.WAjz5RP&app=chrome', 'authority': 'h5api.m.taobao.com', # cookie得注释掉, 否则为非法请求 # 'cookie': '' } async def _get_target_url_and_content_id_and_csid(self, taobao_short_url): ''' 根据给与的淘宝分享短链接, 得到target_url, content_id, csid :param taobao_short_url: :return: ''' if re.compile(r'contentId').findall(taobao_short_url) != []: # 先检查是否已为目标地址 target_url = taobao_short_url else: body = Requests.get_url_body( url=taobao_short_url, headers=self.headers, ip_pool_type=self.ip_pool_type, ) # self.lg.info(str(body)) if body == '': self.lg.error('获取到的body为空值, 出错短链接地址: {0}'.format( str(taobao_short_url))) return '', '', '' try: # 获取短连接的目标地址 target_url = re.compile('var url = \'(.*?)\';').findall( body)[0] self.lg.info('获取到原始连接: {}'.format(target_url)) except IndexError: self.lg.error('获取target_url的时候IndexError! 
出错短链接地址: {0}'.format( str(taobao_short_url))) target_url = '' try: # 得到contentId content_id = re.compile('contentId=(\d+)').findall(target_url)[0] self.lg.info(content_id) except IndexError: self.lg.error('获取content_id时IndexError! 出错短链接地址: {0}'.format( str(taobao_short_url))) content_id = '' try: # 得到csid csid = re.compile('csid%22%3A%22(.*?)%22%7D').findall( target_url)[0] # self.lg.info(csid) except IndexError: self.lg.info('此链接为无csid情况的链接...') # self.lg.error('获取csid时IndexError! 出错短链接地址: {0}'.format(str(taobao_short_url))) csid = '' try: tag_name = re.compile('tagName=(.*?)&').findall(target_url)[0] except IndexError: tag_name = '' try: tag = re.compile('tag=(.*?)&').findall(target_url)[0] except IndexError: tag = '' return target_url, content_id, csid, tag_name, tag async def _get_api_body(self, taobao_short_url): ''' 获取该页面api返回的文件 :param taobao_short_url: :return: body 类型 str ''' base_url = 'https://h5api.m.taobao.com/h5/mtop.taobao.beehive.detail.contentservicenewv2/1.0/' try: target_url, content_id, csid, tag_name, tag = await self._get_target_url_and_content_id_and_csid( taobao_short_url) except ValueError: self.lg.error('遇到ValueError!', exc_info=True) return '' if content_id == '' and csid == '': # 异常退出 return '' data = dumps({ 'businessSpm': '', 'business_spm': '', 'contentId': content_id, 'params': dumps({ "csid": csid, }) if csid != '' else '', # 没有csid时,就不传这个参数 'source': 'weitao_2017_nocover', 'tagName': tag_name, # 这个是我自己额外加的用于获取tags的api接口 'track_params': '', 'type': 'h5', }) params = { 'AntiCreep': 'true', 'AntiFlood': 'true', 'api': 'mtop.taobao.beehive.detail.contentservicenewv2', 'appKey': '12574478', 'callback': 'mtopjsonp1', # 'data': '{"contentId":"200668154273","source":"weitao_2017_nocover","type":"h5","params":"{\\"csid\\":\\"54a52aea54b7c29d289a0e36b2bf2f51\\"}","businessSpm":"","business_spm":"","track_params":""}', 'data': data, 'dataType': 'jsonp', 'data_2': '', 'jsv': '2.4.11', # 'sign': 'e8cb623e58bab0ceb10e9edffdacd5b2', # 't': 
'1527300457911', 'type': 'jsonp', 'v': '1.0' } # TODO 新版 # 必传参数(无cookies, sign正确也无结果!) # 而且登录后的cookies, 但是继续采集, tb会报: 亲,访问被拒绝了哦!请检查是否使用了代理软件或VPN哦~ result_1 = await get_taobao_sign_and_body( base_url=base_url, headers=self.headers, params=params, data=data, logger=self.lg, ip_pool_type=self.ip_pool_type) _m_h5_tk = result_1[0] if _m_h5_tk == '': self.lg.error( '获取到的_m_h5_tk为空str! 出错短链接地址: {0}'.format(taobao_short_url)) # 带上_m_h5_tk, 和之前请求返回的session再次请求得到需求的api数据 result_2 = await get_taobao_sign_and_body( base_url=base_url, headers=self.headers, params=params, data=data, _m_h5_tk=_m_h5_tk, session=result_1[1], logger=self.lg, ip_pool_type=self.ip_pool_type) body = result_2[2] return body async def _deal_with_api_info(self, taobao_short_url): ''' 处理api返回的信息, 并结构化存储 :param taobao_short_url: :return: ''' data = await self._get_api_body(taobao_short_url) if data == '': self.lg.error('获取到的api数据为空值!') return {} try: data = re.compile('mtopjsonp1\((.*)\)').findall(data)[0] except IndexError: self.lg.error( 're获取主信息失败, IndexError, 出错短链接地址:{0}'.format(taobao_short_url)) data = {} try: data = await self._wash_api_info(loads(data)) # pprint(data) except Exception as e: self.lg.error('出错短链接地址:{0}'.format(taobao_short_url)) self.lg.exception(e) return {} article = await self._get_article(data=data, taobao_short_url=taobao_short_url) pprint(article) if article != {} and article.get('share_id', '') != '': '''采集该文章推荐的商品''' await self._crawl_and_save_these_goods( goods_url_list=article.get('goods_url_list', [])) '''存储该文章info''' await self._save_this_article(article=article) return True else: self.lg.info('获取到的文章失败! 
article为空dict!') return False

    async def _crawl_and_save_these_goods(self, goods_url_list):
        '''
        Crawl the goods recommended by this article and persist each one.

        :param goods_url_list: list of dicts, each carrying a 'goods_url' key
                               (assumed to be a taobao item url containing
                               an ``id=<digits>`` query param — TODO confirm)
        :return: True once the whole list has been processed
        '''
        # Collect the goods ids already stored for these site ids so we can
        # skip duplicates below.
        sql_str = 'select GoodsID from dbo.GoodsInfoAutoGet where SiteID=1 or SiteID=3 or SiteID=4 or SiteID=6'
        try:
            result = self.my_pipeline._select_table(sql_str=sql_str)
        except TypeError:
            # _select_table presumably returned None on failure; treat as empty.
            result = []

        self.lg.info('即将开始抓取该文章的goods, 请耐心等待...')
        index = 1

        db_all_goods_id_list = [item[0] for item in result]
        for item in goods_url_list:
            try:
                # Quick pre-filter: pull the numeric goods id out of the url.
                goods_id = re.compile(r'id=(\d+)').findall(
                    item.get('goods_url', ''))[0]
            except IndexError:
                self.lg.error('re获取goods_id时出错, 请检查!')
                continue

            if goods_id in db_all_goods_id_list:
                # Already stored in the db — skip.
                self.lg.info('该goods_id[{0}]已存在于db中!'.format(goods_id))
                continue
            else:
                taobao = TaoBaoLoginAndParse(logger=self.lg)
                # Reconnect every 50 items to avoid a single long-lived db
                # connection going stale and erroring out.
                if index % 50 == 0:
                    self.lg.info('正在重置,并与数据库建立新连接中...')
                    self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                    self.lg.info('与数据库的新连接成功建立...')

                if self.my_pipeline.is_connect_success:
                    # Re-derive the goods id via the parser's own extractor
                    # (shadows the regex result above — TODO confirm the two
                    # always agree).
                    goods_id = taobao.get_goods_id_from_url(
                        item.get('goods_url', ''))
                    if goods_id == '':
                        self.lg.info('@@@ 原商品的地址为: {0}'.format(
                            item.get('goods_url', '')))
                        continue
                    else:
                        self.lg.info(
                            '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                            % (goods_id, str(index)))
                        # NOTE(review): the fetch call's return value `tt` is
                        # unused here; presumably get_goods_data populates
                        # internal state that deal_with_data reads — do not
                        # reorder or drop this call without confirming.
                        tt = taobao.get_goods_data(goods_id)
                        data = taobao.deal_with_data(goods_id=goods_id)
                        if data != {}:
                            data['goods_id'] = goods_id
                            data[
                                'goods_url'] = 'https://item.taobao.com/item.htm?id=' + str(
                                    goods_id)
                            data['username'] = '******'
                            data['main_goods_id'] = None

                            # print('------>>>| 爬取到的数据为: ', data)
                            taobao.old_taobao_goods_insert_into_new_table(
                                data, pipeline=self.my_pipeline)
                        else:
                            # Parser returned an empty dict — nothing to store.
                            pass
                else:
                    # db connection is down (possibly closed or under
                    # maintenance) — skip storage for this item.
                    self.lg.info('数据库连接失败,数据库可能关闭或者维护中')
                    pass
                index += 1
                gc.collect()
                # Throttle between items to stay under taobao's rate limits.
                await asyncio.sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
        self.lg.info('该文章的商品已经抓取完毕!')

        return True

    async def _save_this_article(self, article):
        '''
        Persist this article's info into dbo.daren_recommend.

        :param article: a WellRecommendArticle-like mapping (see _get_article)
        :return: True on success or when the share_id already exists,
                 False when the db connection is down
        '''
        # Dedupe on share_id against what is already stored.
        sql_str = 'select share_id from dbo.daren_recommend'
        db_share_id = [
            j[0] for j in
            list(self.my_pipeline._select_table(sql_str=sql_str))
        ]
        if article.get('share_id') in db_share_id:
            self.lg.info('该share_id({})已存在于数据库中, 此处跳过!'.format(
                article.get('share_id', '')))
            return True
        else:
            self.lg.info('即将开始存储该文章...')
            if self.my_pipeline.is_connect_success:
                params = await self._get_db_insert_params(item=article)
                # pprint(params)
                # Parameterized insert — params order must match
                # _get_db_insert_params exactly.
                sql_str = r'insert into dbo.daren_recommend(nick_name, head_url, profile, share_id, gather_url, title, comment_content, share_goods_base_info, div_body, create_time, site_id) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
                self.my_pipeline._insert_into_table_2(sql_str=sql_str,
                                                      params=params,
                                                      logger=self.lg)
                return True
            else:
                self.lg.error('db连接失败!存储失败! 出错article地址:{0}'.format(
                    article.get('gather_url', '')))
                return False

    async def _get_db_insert_params(self, item):
        '''
        Build the params tuple for the dbo.daren_recommend insert.

        Order must match the column list in _save_this_article's sql_str.
        :param item: article mapping holding the keys read below
        :return: tuple of insert parameters
        '''
        params = (
            item['nick_name'],
            item['head_url'],
            item['profile'],
            item['share_id'],
            item['gather_url'],
            item['title'],
            item['comment_content'],
            # dumps(item['share_img_url_list'], ensure_ascii=False),
            # dumps(item['goods_id_list'], ensure_ascii=False),
            dumps(item['share_goods_base_info'], ensure_ascii=False),
            item['div_body'],
            item['create_time'],
            item['site_id'],
        )

        return params

    async def _get_article(self, data, taobao_short_url):
        '''
        Extract the fields we need for this article from the raw api payload.

        :param data: decoded api response (assumed shape
                     data['data']['models'][...] — TODO confirm against the
                     WeiTao capture endpoint)
        :param taobao_short_url: the short link this article was gathered from
        :return: a populated WellRecommendArticle, or {} on any failure
        '''
        try:
            nick_name = data.get('data', {}).get('models',
                                                 {}).get('account',
                                                         {}).get('name', '')
            assert nick_name != '', '获取到的nick_name为空值!'

            head_url = await self._get_head_url(data=data)

            # The recommender's bio / signature line; may come back as None.
            tmp_profile = data.get('data',
                                   {}).get('models',
                                           {}).get('account',
                                                   {}).get('accountDesc', '')
            profile = tmp_profile if tmp_profile is not None else ''

            title = self._wash_sensitive_info(
                data.get('data', {}).get('models',
                                         {}).get('content',
                                                 {}).get('title', ''))
            # self.lg.info(title)
            assert title != '', '获取到的title为空值!请检查!'

            # The recommender's comment; usable as teaser text on the
            # recommendation home page.
            comment_content = self._wash_sensitive_info(
                data.get('data', {}).get('models',
                                         {}).get('content',
                                                 {}).get('summary', ''))

            '''微淘抓包的接口: 图片,商品依次对应'''
            # (From the WeiTao capture endpoint the images and goods in
            # drawerList correspond one-to-one.)
            tmp_goods_list = data.get('data', {}).get('models', {}).get(
                'content', {}).get('drawerList', [])
            assert tmp_goods_list != [], '获取到的goods_id_list为空list! 请检查! 可能该文章推荐商品为空[]!'

            share_img_url_list = [{
                'img_url':
                'https:' + item.get('itemImages', [])[0].get('picUrl', '')
            } for item in tmp_goods_list]

            goods_id_list = [{
                'goods_id': item.get('itemId', '')
            } for item in tmp_goods_list]

            # Since the images pair one-to-one with the goods, store them as a
            # single field; drop duplicate recommendations while preserving
            # the original order.
            share_goods_base_info = list_duplicate_remove([{
                'img_url':
                'https:' + item.get('itemImages', [])[0].get('picUrl', ''),
                'goods_id':
                item.get('itemId', ''),
            } for item in tmp_goods_list])

            # div_body
            div_body = self._wash_sensitive_info(
                await self._get_div_body(rich_text=data.get('data', {}).get(
                    'models', {}).get('content', {}).get('richText', [])))
            # print(div_body)

            # Goods urls still to be crawled, normalized to taobao form; a
            # tmall item url redirects to tmall in the browser anyway.
            goods_url_list = [{
                'goods_url':
                'https://item.taobao.com/item.htm?id=' + item.get('goods_id', '')
            } for item in goods_id_list]

            _ = (
                await
                self._get_target_url_and_content_id_and_csid(taobao_short_url))
            gather_url = _[0]
            share_id = _[1]  # i.e. the content_id

            create_time = get_shanghai_time()
            site_id = 2  # taobao WeiTao

            # tags: extra article links attached to this one
            tags = await self._get_tags(data=data)
            # pprint(tags)
        except Exception as e:
            # Any missing field or failed assert lands here; log and signal
            # failure with an empty dict.
            self.lg.error('出错短链接地址:{0}'.format(taobao_short_url))
            self.lg.exception(e)
            return {}

        article = WellRecommendArticle()
        article['nick_name'] = nick_name
        article['head_url'] = head_url
        article['profile'] = profile
        article['share_id'] = share_id
        article['title'] = title
        article['comment_content'] = comment_content
        article['share_img_url_list'] = share_img_url_list
        article['goods_id_list'] = goods_id_list
        article['div_body'] = div_body
        article['gather_url'] = gather_url
        article['create_time'] = create_time
        article['site_id'] = site_id
        article['goods_url_list'] = goods_url_list
        article['tags'] = tags
        article['share_goods_base_info'] = share_goods_base_info

        return article

    async def _get_head_url(self, data):
        '''
        Get the recommender's avatar url, forcing an https scheme when the
        payload carries a scheme-relative url.

        :param data: decoded api response (same shape as in _get_article)
        :return: absolute avatar url, or '' when absent
        '''
        tmp_head_url = data.get('data',
                                {}).get('models',
                                        {}).get('account',
                                                {}).get('accountPic',
                                                        {}).get('picUrl', '')
        if tmp_head_url != '':
            if re.compile('http').findall(tmp_head_url) == []:
                # Scheme-relative url like //img...; prefix https:.
                head_url = 'https:' + tmp_head_url
            else:
                head_url = tmp_head_url
        else:
            head_url = ''

        return head_url

    def _wash_sensitive_info(self, data):
        '''
        Strip sensitive platform names (taobao/tmall in Chinese and English)
        out of the given text.

        :param data: input string
        :return: cleaned string
        '''
        data = re.compile('淘宝|天猫|taobao|tmall|TAOBAO|TMALL').sub('', data)

        return data

    async def _get_tags(self, data):
        '''
        Get the info of the extra articles linked from this one.

        :param data: decoded api response (same shape as in _get_article)
        :return: list of {'url': ..., 'name': ...} dicts
        '''
        tags = data.get('data', {}).get('models', {}).get('tags', [])
        tags = [{
            'url': unquote(item.get('url', '')),
            'name': item.get('name', ''),
        } for item in tags]

        return tags

    async def _get_div_body(self, rich_text):
        '''
        Render the article's rich-text resources into a single html <div>.

        Text becomes <p>, pictures become sized <img>, and referenced goods
        become hidden <p> markers carrying the goods_id.
        :param rich_text: the raw richText list to process
        :return: '<div>...</div>' string, or '' when nothing was rendered
        '''
        div_body = ''
        for item in rich_text:
            if item.get('resource') is None:
                continue

            for resource_item in item.get('resource', []):  # may be several
                # resource = item.get('resource', [])[0]
                text = resource_item.get('text', '')        # descriptive text
                picture = resource_item.get('picture', {})  # descriptive image
                _goods = resource_item.get('item', {})      # one goods item

                if text != '':
                    text = '<p style="height:auto;width:100%">' + text + '</p>' + '<br>'
                    div_body += text
                    continue

                if picture != {}:
                    # Build an <img> tag carrying the picture's own
                    # width/height from the payload.
                    _ = r'<img src="{0}" style="height:{1}px;width:{2}px;"/>'.format(
                        'https:' + picture.get('picUrl', ''),
                        picture.get('picHeight', ''),
                        picture.get('picWidth', ''))
                    _ = _ + '<br>'
                    div_body += _
                    continue

                if _goods != {}:
                    # Hidden marker recording the referenced goods_id.
                    _hiden_goods_id = r'<p style="display:none;">此处有个商品[goods_id]: {0}</p>'.format(
                        _goods.get('itemId', '')) + '<br>'
                    div_body += _hiden_goods_id
                    continue

        return '<div>' + div_body + '</div>' if div_body != '' else ''

    async def _wash_api_info(self, data):
        '''
        Strip bulky, unneeded sections out of the raw api response in place.

        :param data: decoded api response dict
        :return: the same dict, with assets/config/modules blanked (best
                 effort — left untouched if the keys are missing)
        '''
        try:
            data['data']['assets'] = []
            data['data']['models']['config'] = {}
            data['data']['modules'] = []
        except Exception:
            pass

        return data

    def __del__(self):
        # Best-effort teardown of logger/message/pipeline references.
        # NOTE(review): bare `except:` also swallows SystemExit/KeyboardInterrupt;
        # `except Exception:` would be safer — left as-is to preserve behavior.
        try:
            del self.lg
            del self.msg
            del self.my_pipeline
        except:
            pass
        gc.collect()