def _get_origin_comment_list(self, **kwargs) -> list:
    '''
    Collect the raw comment records of one offer, one page at a time.
    (Original note: the interface data is encrypted.)

    :param kwargs: reads ``csrf``, ``goods_id`` and ``cookies``; each defaults to ''
    :return: the ``data.model`` entries of every requested page, concatenated
    '''
    csrf_token = kwargs.get('csrf', '')
    offer_id = kwargs.get('goods_id', '')

    request_url = 'https://m.1688.com/page/offerRemark.htm'
    request_headers = {
        'cookie': kwargs.get('cookies', ''),
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'user-agent': get_random_pc_ua(),
        'accept': 'application/json, text/javascript, */*; q=0.01',
        'referer': 'https://m.1688.com/page/offerRemark.htm?offerId={}'.format(offer_id),
        'authority': 'm.1688.com',
        'x-requested-with': 'XMLHttpRequest',
    }

    collected = []
    # Pages run from 1 up to, but not including, self.max_page.
    for page_no in range(1, self.max_page):
        navigate_options = {
            'data': {
                'bizType': 'trade',
                'itemId': int(offer_id),
                'offerId': str(offer_id),
                'page': page_no,
                'pageSize': 5,
                # 'receiveUserId': 989036456,
                'starLevel': 7
            }
        }
        # Timestamp string followed by a random number drawn with start_num=100, end_num=999.
        timestamp_part = str(datetime_to_timestamp(get_shanghai_time()))
        random_part = str(get_random_int_number(start_num=100, end_num=999))
        query = (
            ('_csrf', csrf_token),
            ('__wing_navigate_type', 'view'),
            ('__wing_navigate_url', 'detail:modules/offerRemarkList/view'),
            ('__wing_navigate_options', dumps(navigate_options)),
            ('_', timestamp_part + random_part),
        )
        response_body = Requests.get_url_body(
            url=request_url,
            headers=request_headers,
            params=query,
            ip_pool_type=self.ip_pool_type)
        page_data = json_2_dict(response_body, encoding='ascii').get('data', {})
        page_comments = page_data.get('model', [])
        pprint(page_comments)
        collected.extend(page_comments)
        # Short pause between consecutive page requests.
        sleep(.25)

    return collected
def _get_params(self, goods_id):
    '''
    Build the query params used to fetch sku_info.

    :param goods_id: goods identifier; sent as a string under ``goodsId``
    :return: tuple of (key, value) pairs holding ``t`` and ``goodsId``
    '''
    # ``t`` is a timestamp string followed by a random number (start_num=100, end_num=999).
    timestamp_part = str(datetime_to_timestamp(get_shanghai_time()))
    random_part = str(get_random_int_number(start_num=100, end_num=999))

    return (
        ('t', timestamp_part + random_part),
        ('goodsId', str(goods_id)),
        # Region params left disabled in the original:
        # ('provinceCode', '330000'),
        # ('cityCode', '330100'),
        # ('districtCode', '330102'),
    )
def _get_simulate_log_info(retries=10) -> str:
    '''
    Build a prefix that imitates a log.info line, for use with print.

    :param retries: number of extra attempts made when building the prefix raises ValueError
    :return: '<shanghai time>,<random number> [INFO ] ➞ ', or '' once the retries are used up
    '''
    attempts_left = retries
    while True:
        try:
            return str(get_shanghai_time()) + ',' + str(
                get_random_int_number(100, 999)) + ' [INFO ] ➞ '
        except ValueError:
            if attempts_left > 0:
                # Same as the original recursive call with ``retries - 1``.
                attempts_left -= 1
            else:
                return ''
def get_random_sqlite_obj() -> BaseSqlite3Cli:
    """
    Return one of the five module-level sqlite clients, chosen at random.

    :return: one of sqlite3_cli0 .. sqlite3_cli4
    :raises NotImplementedError: if the random helper yields a value outside 0..4
    """
    # Reading module globals needs no ``global`` declaration; nothing is rebound here.
    random_num = get_random_int_number(0, 4)
    candidates = (
        sqlite3_cli0,
        sqlite3_cli1,
        sqlite3_cli2,
        sqlite3_cli3,
        sqlite3_cli4,
    )
    for index, client in enumerate(candidates):
        if random_num == index:
            return client

    # ``NotImplemented`` is a comparison sentinel, not an exception class; raising it
    # fails with TypeError. ``NotImplementedError`` is the exception that was intended.
    raise NotImplementedError(
        'unexpected random index: {!r}'.format(random_num))
def _get_one_page_articles(self, page_num) -> list:
    '''
    Fetch one page of news articles.

    :param page_num: page number to request; sent as a string under ``page``
    :return: the ``data.items`` list of the response, each item's ``user_info``
        replaced by its parsed form; [] when the response has no items
    '''
    headers = {
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'User-Agent': get_random_pc_ua(),
        'Accept': '*/*',
        'Referer': 'https://36kr.com/',
        'Connection': 'keep-alive',
    }
    params = (
        ('per_page', '20'),
        ('page', str(page_num)),
        # Timestamp string followed by a random number drawn from 100..999.
        ('_', str(datetime_to_timestamp(get_shanghai_time())) + str(get_random_int_number(100, 999))),
    )
    url = 'https://36kr.com/api/search-column/mainsite'

    body = Requests.get_url_body(
        url=url,
        headers=headers,
        params=params,
        cookies=None)
    articles = json_2_dict(body).get('data', {}).get('items', [])
    # Falsy check also covers a null/missing ``items`` value, which previously
    # slipped past ``== []`` and crashed when iterated.
    if not articles:
        return []

    # ``user_info`` is passed through json_2_dict and stored back on the item.
    # A plain loop, because the update is a side effect rather than a new list.
    for article in articles:
        article.update({'user_info': json_2_dict(article.get('user_info', ''))})

    return articles
def _parse_page_from_wx(self, **kwargs):
    '''
    Parse the info of a single wx article.

    :param kwargs: reads ``article_link``, ``article_info`` (a dict whose ``data``
        entry holds the article fields) and ``article_likes`` (defaults to a random int)
    :return: a WellRecommendArticle object, or {} when a required field is missing
        or any step raises (the error is logged after sleeping)
    '''
    def _require(condition, message):
        # Explicit raise instead of ``assert`` so the checks survive ``python -O``.
        if not condition:
            raise AssertionError(message)

    article_link = kwargs.get('article_link', '')
    article_info = kwargs.get('article_info', {}).get('data', {})
    article_likes = kwargs.get('article_likes', get_random_int_number())

    error_msg = '出错article_url: {0}'.format(article_link)
    try:
        nick_name = article_info.get('user', {}).get('nickname', '')
        _require(nick_name != '', '获取到的nick_name为空值!请检查!' + error_msg)
        head_url = article_info.get('user', {}).get('images', '')
        _require(head_url != '', '获取到的head_url为空值!请检查!' + error_msg)
        profile = ''  # bio / personal signature (left empty)
        share_id = article_info.get('id', '')
        _require(share_id != '', '获取到的share_id为空值!请检查!' + error_msg)
        title = self.wash_sensitive_info(article_info.get('title', ''))  # title defaults to empty
        comment_content = self.wash_sensitive_info(
            article_info.get('desc', ''))
        _require(comment_content != '', '获取到的comment_content为空!请检查!' + error_msg)
        share_img_url_list = [{
            # for a video, the first image is the video's first frame
            'img_url': item.get('original', ''),
            'height': item.get('height'),  # image height and width
            'width': item.get('width'),
        } for item in article_info.get('images_list', [])]
        _require(share_img_url_list != [], '获取到的share_img_url_list为空list!请检查!' + error_msg)
        div_body = ''  # left empty by default
        gather_url = article_link
        # original creation date of the article
        tmp_create_time = article_info.get('time', '')
        # error_msg appended for consistency with every other check in this method
        _require(tmp_create_time != '', '获取到的create_time为空值!请检查!' + error_msg)
        # presumably the source time string lacks seconds — TODO confirm
        create_time = string_to_datetime(tmp_create_time + ':00')
        site_id = 3  # Xiaohongshu
        goods_url_list = []  # goods urls of this article still to be crawled
        share_goods_base_info = []
        # the wx endpoint returns no tags
        tags = self._get_tags_from_wx(article_info=article_info)
        # video playback url: drop the query string, then rewrite the host prefix
        tmp_video_url = article_info.get('video', '')
        # raw string: '\?' in a normal literal is an invalid escape sequence
        tmp_video_url = re.compile(r'\?.*').sub('', tmp_video_url)
        # NOTE(review): the '.' in '//sa.' is unescaped and matches any character — confirm intent
        video_url = re.compile(r'//sa.').sub(r'//v.', tmp_video_url)
        likes = article_likes
        collects = article_info.get('fav_count', None)
        _require(collects is not None, '获取到的collects为None!请检查!' + error_msg)
    except Exception:
        # Boundary handler: back off, log with traceback, and signal failure with {}.
        sleep(self.CRAWL_ARTICLE_SLEEP_TIME)
        self.lg.error('遇到错误:', exc_info=True)
        return {}

    article = WellRecommendArticle()
    article['nick_name'] = nick_name
    article['head_url'] = head_url
    article['profile'] = profile
    article['share_id'] = share_id
    article['title'] = title
    article['comment_content'] = comment_content
    article['share_img_url_list'] = share_img_url_list
    article['div_body'] = div_body
    article['gather_url'] = gather_url
    article['create_time'] = create_time
    article['site_id'] = site_id
    article['goods_url_list'] = goods_url_list
    article['tags'] = tags
    article['share_goods_base_info'] = share_goods_base_info
    article['video_url'] = video_url
    article['likes'] = likes
    article['collects'] = collects

    return article
''' from requests import session from requests_toolbelt import MultipartEncoder from fzutils.spider.fz_requests import Requests from fzutils.time_utils import ( get_shanghai_time, datetime_to_timestamp, ) from fzutils.common_utils import get_random_int_number cookies = { 'yd_cookie': '2369844f-fc3f-42742d88d5deabc0ec65d866d61526e32347', } _t = str(datetime_to_timestamp(get_shanghai_time())) + str( get_random_int_number(100, 999)) data = MultipartEncoder( fields={ 'PageIndex': '1', 'PageSize': '20', # 'TimesTamp': '1547813627151', 'TimesTamp': _t, 'UserId': '259146', 'sign': '42531e765ce3055f25f369db3505db8f' }) headers = { 'Host': 'api.yiuxiu.com', 'accept': 'application/json', # 'content-type': 'multipart/form-data; boundary=Boundary+C98168C62FD125E1', 'content-type': data.content_type, # 'token': '', # jwt token