def _get_rank_fund_info(self):
    '''
    Fetch the fund-ranking data of all funds from eastmoney (天天基金).

    Iterates pages [self.page_num_start, self.page_num_end), pulls each
    page's ``rankData`` js payload through phantomjs, decodes it with
    demjson and collects one dict per fund.

    :return: a list of per-fund dicts; [] when a page carries no data
    '''
    rank_fund_list = []
    # The query window is "one year back from today". Compute it once —
    # the previous version rebuilt it (plus two large, unused cookies/
    # headers dicts) on every page iteration.
    _now = get_shanghai_time()
    end_date = str(_now)[:10]
    start_date = str(datetime.datetime(
        year=_now.year - 1,
        month=_now.month,
        day=_now.day))[:10]
    print('开始时间: {0}, 结束时间: {1}'.format(start_date, end_date))
    url = 'http://fund.eastmoney.com/data/rankhandler.aspx'
    for page_num in range(self.page_num_start, self.page_num_end):
        print('正在抓取第{0}页的基金信息...'.format(page_num))
        params = (
            ('op', 'ph'),
            ('dt', 'kf'),
            ('ft', 'all'),
            ('rs', ''),
            ('gs', '0'),
            ('sc', 'zzf'),
            ('st', 'desc'),
            ('sd', start_date),     # e.g. '2017-07-10'
            ('ed', end_date),       # e.g. '2018-07-10'
            ('qdii', ''),
            ('tabSubtype', ',,,,,'),
            ('pi', str(page_num)),  # page index of the rank data
            ('pn', '50'),           # page size
            ('dx', '1'),
            # ('v', '0.5290053467389759'),
        )
        # NOTE: plain requests gets a 502 here, hence phantomjs.
        body = self.my_phantomjs.get_url_body(
            url=_get_url_contain_params(url, params))
        try:
            body = re.compile('<body>(.*)</body>').findall(body)[0]
            this_page_rank_data = re.compile(r'rankData = (.*);').findall(
                body)[0]
            # print(this_page_rank_data)
        except IndexError:
            print('在获取this_page_rank_data时索引异常!请检查!')
            continue

        # The payload is not strict JSON (unquoted keys — json.loads fails
        # with "Expecting property name enclosed in double quotes"), so
        # demjson is used to decode it.
        this_page_rank_data = demjson.decode(this_page_rank_data).get(
            'datas', {})
        # pprint(this_page_rank_data)
        if this_page_rank_data == {}:
            # NOTE(review): an empty page discards everything collected so
            # far — kept as-is since callers may rely on it; confirm.
            return []

        for item in this_page_rank_data:
            # each item is a comma-joined record; fields are positional
            _i = item.split(',')
            rank_fund_list.append({
                '基金代码': _i[0],
                '基金简称': _i[1],
                '当天日期': _i[3],
                '单位净值': _i[4],
                '累计净值': _i[5],
                '日增长率': _i[6],
                '近1周': _i[7],
                '近1月': _i[8],
                '近3月': _i[9],
                '近6月': _i[10],
                '近1年': _i[11],
                '近2年': _i[12],
                '近3年': _i[13],
                '今年来': _i[14],
                '成立来': _i[15],
                '手续费': _i[20],
            })
        sleep(2.5)  # throttle between pages

    print('\n抓取完毕!\n')
    # pprint(rank_fund_list)
    return rank_fund_list
def _get_one_fund_info(self, fund_code):
    '''
    Fetch the raw pingzhongdata js payload of a single fund and hand it
    to self._get_this_fund_info for parsing.

    :param fund_code: fund code string, e.g. '001092'
    :return: True
    '''
    raw_cookies = {
        'st_pvi': '11586003301354',
        'st_si': '46806950936799',
        'ASP.NET_SessionId': 'fhllwae2zicg00o0x4ub1fxs',
        'EMFUND1': 'null',
        'EMFUND0': 'null',
        'EMFUND2': '07-10 18:01:38@#$华润元大现金通货币B@#$002884',
        'EMFUND3': '07-10 18:01:48@#$天弘现金管家货币B@#$420106',
        'EMFUND4': '07-10 18:11:53@#$方正富邦保险主题指数分级@#$167301',
        'EMFUND5': '07-10 18:04:32@#$招商中证银行指数分级@#$161723',
        'EMFUND6': '07-10 18:05:13@#$天弘中证银行指数C@#$001595',
        'EMFUND7': '07-10 18:06:13@#$天弘中证银行指数A@#$001594',
        'EMFUND8': '07-10 18:11:22@#$申万菱信多策略灵活配置混合A@#$001148',
        'EMFUND9': '07-10 18:12:26@#$广发生物科技指数(QDII)@#$001092',
    }
    cookies = unquote_cookies(raw_cookies)
    # pprint(cookies)
    headers = {
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Accept': '*/*',
        'Proxy-Connection': 'keep-alive',
    }
    # '2018-07-10 18:30:46' -> '20180710183046' (strip '-', ' ' and ':')
    v = str(get_shanghai_time()).replace(
        '-', '').replace(' ', '').replace(':', '')
    # print(v)
    params = (
        ('v', v),  # cache-busting timestamp
    )
    fund_url = 'http://fund.eastmoney.com/pingzhongdata/{0}.js'.format(
        fund_code)
    # plain requests gets blocked here, so the body is fetched via phantomjs
    body = self.my_phantomjs.get_url_body(
        url=_get_url_contain_params(fund_url, params))
    # print(body)
    self._get_this_fund_info(body=body)

    return True
def get_home_page_info_by_page_num(self, page_num: int) -> list:
    """
    Fetch and parse one listing page of https://www.doutula.com/article/list/.

    :param page_num: 1-based page index
    :return: list of dicts with keys 'title', 'create_time',
             'article_img_list'; [] when nothing could be parsed
    """
    def parse_page_info(body) -> list:
        """
        Parse one listing page body into a list of article dicts.

        :param body: html text of the listing page
        :return: list of parsed article dicts
        """
        # css selectors for each article item (<a class="random_list">)
        li_sel = {
            'method': 'css',
            'selector': 'div.center-wrap a.random_list',
        }
        title_sel = {
            'method': 'css',
            'selector': 'div.random_title ::text',
        }
        create_time_sel = {
            'method': 'css',
            'selector': 'div.date ::text',
        }
        article_img_url_sel = {
            'method': 'css',
            'selector': 'div.random_article img ::attr("data-original")',
        }
        article_img_name_sel = {
            'method': 'css',
            'selector': 'div.random_article img ::attr("alt")',
        }
        li_list = parse_field(
            parser=li_sel,
            target_obj=body,
            is_first=False,
        )
        res = []
        for item in li_list:
            try:
                title = parse_field(
                    parser=title_sel,
                    target_obj=item,
                )
                assert title != ''
                create_time = parse_field(
                    parser=create_time_sel,
                    target_obj=item,
                )
                assert create_time != ''
                article_img_url_list = parse_field(
                    parser=article_img_url_sel,
                    target_obj=item,
                    is_first=False,
                )
                assert article_img_url_list != []
                article_img_name_list = parse_field(
                    parser=article_img_name_sel,
                    target_obj=item,
                    is_first=False,
                )
                assert article_img_name_list != []
                article_img_list = [{
                    'img_name': img_name,
                    'img_url': img_url,
                } for img_name, img_url in zip(
                    article_img_name_list, article_img_url_list)]
            except (AssertionError, IndexError):
                # skip items missing any required field
                continue
            res.append({
                'title': title,
                'create_time': create_time,
                'article_img_list': article_img_list,
            })
        return res

    headers = self.get_random_phone_headers()
    headers.update({
        'authority': 'www.doutula.com',
        'referer': 'https://www.doutula.com/',
    })
    # NOTE(review): headers is not passed to the driver below — kept only
    # for the (disabled) requests path; confirm before removing.
    params = (('page', str(page_num)), )
    url = 'https://www.doutula.com/article/list/'
    # requests returns mojibake for this site, so a phone-UA driver is used
    d = BaseDriver(
        ip_pool_type=tri_ip_pool,
        user_agent_type=PHONE)
    try:
        body = d.get_url_body(
            url=_get_url_contain_params(url=url, params=params))
    finally:
        # BUGFIX: release the driver even when the fetch raises — the old
        # code only dropped it on the success path, behind a bare except
        del d
    # print(body)
    res = parse_page_info(body=body)
    print('[{}] page_num: {}'.format(
        '+' if res != [] else '-',
        page_num,
    ))
    collect()
    return res
def _get_goods_data(self, goods_id):
    '''
    Fetch and assemble the goods data for one NetEase Yanxuan item.

    :param goods_id: goods id string; '' is rejected up front
    :return: the assembled ``data`` dict on success, otherwise the value
             of ``self._get_data_error_init()``
    '''
    if goods_id == '':
        self.my_lg.error('获取到的goods_id为空值!此处跳过!')
        return self._get_data_error_init()

    # scrape via the Yanxuan mobile (m) site detail endpoint
    url = 'http://m.you.163.com/item/detail'
    params = self._get_params(goods_id=goods_id)
    m_url = url + '?id={0}'.format(goods_id)
    self.my_lg.info('------>>>| 正在抓取严选地址为: {0}'.format(m_url))
    # context string appended to every error log below
    write_info = '出错goods_id:{0}, 出错地址: {1}'.format(goods_id, m_url)

    '''requests被无限转发'''
    # requests gets endlessly redirected here, hence phantomjs below
    # body = MyRequests.get_url_body(url=url, headers=self.headers, params=params)
    # self.my_lg.info(str(body))
    '''改用phantomjs'''
    body = self.my_phantomjs.use_phantomjs_to_get_url_body(url=_get_url_contain_params(url=url, params=params))
    if body == '':
        self.my_lg.error('获取到的body为空值!'+write_info)
        return self._get_data_error_init()

    # extract the embedded "var jsonData=..." blob from the page source
    try:
        body = re.compile('var jsonData=(.*?),policyList=').findall(body)[0]
    except IndexError:
        self.my_lg.error('获取body时索引异常!'+write_info, exc_info=True)
        return self._get_data_error_init()

    # the blob is not strict JSON; normalize it before json_2_dict
    body = nonstandard_json_str_handle(json_str=body)
    # self.my_lg.info(str(body))
    _ = json_2_dict(
        json_str=body,
        logger=self.my_lg)
    # pprint(_)
    if _ == {}:
        self.my_lg.error('获取到的data为空dict!'+write_info)
        return self._get_data_error_init()

    _ = self._wash_data(_)
    # assemble the output record field by field; any extractor failure
    # aborts the whole goods via the except below
    data = {}
    try:
        data['title'] = self._wash_sensitive_info(self._get_title(data=_))
        data['sub_title'] = self._wash_sensitive_info(self._get_sub_title(data=_))
        data['shop_name'] = ''
        data['all_img_url'] = self._get_all_img_url(data=_)
        data['p_info'] = self._get_p_info(data=_)
        data['div_desc'] = self._get_div_desc(data=_)
        data['sell_time'] = self._get_sell_time(data=_)
        data['detail_name_list'] = self._get_detail_name_list(data=_.get('skuSpecList', []))
        data['price_info_list'] = self._get_price_info_list(data=_.get('skuList', []))
        data['price'], data['taobao_price'] = self._get_price_and_taobao_price(
            price_info_list=data['price_info_list']
        )
        if data['price'] == 0 or data['taobao_price'] == 0:
            # sold-out goods: mark as deleted
            data['is_delete'] = 1
        else:
            data['is_delete'] = self._get_is_delete(price_info_list=data['price_info_list'], data=data, other=_)
    except Exception:
        self.my_lg.error('遇到错误:', exc_info=True)
        self.my_lg.error(write_info)
        return self._get_data_error_init()

    if data != {}:
        self.result_data = data
        return data
    else:
        self.my_lg.info('data为空值')
        return self._get_data_error_init()
def _get_comment_data(self, type: int, goods_id):
    '''
    Fetch up to 3 pages of tmall rate (comment) data for one goods_id
    and pack them into a CommentItem.

    :param type: goods type flag forwarded to self._get_seller_id
    :param goods_id: goods id string
    :return: the populated CommentItem on success, {} on any failure
    '''
    if goods_id == '' or type == '':
        self.result_data = {}
        return {}

    self.lg.info('------>>>| 待处理的goods_id为: %s' % str(goods_id))
    '''先获取到sellerId'''
    try:
        seller_id = self._get_seller_id(type=type, goods_id=goods_id)
    # BUGFIX: was `except AssertionError or IndexError`, which evaluates
    # to `except AssertionError` only — IndexError escaped uncaught.
    except (AssertionError, IndexError) as e:
        self.lg.error('出错goods_id: %s' % goods_id)
        self.lg.error(e.args[0])
        self.result_data = {}
        self.random_sku_info_list = []
        return {}

    """再获取price_info_list"""
    try:
        self.random_sku_info_list = self._get_random_sku_info_list()
        # self.lg.info(self.random_sku_info_list)
    except Exception as e:
        self.lg.error('出错goods_id: %s' % str(goods_id))
        self.lg.exception(e)
        self.result_data = {}
        self.random_sku_info_list = []
        return {}

    _tmp_comment_list = []
    for current_page in range(1, 4):
        self.lg.info('------>>>| 正在抓取第 {0} 页的评论...'.format(
            str(current_page)))
        _url = 'https://rate.tmall.com/list_detail_rate.htm'
        params = self._set_params(goods_id=goods_id, seller_id=seller_id, current_page=current_page)
        self.headers.update({
            'referer': 'https://detail.m.tmall.com/item.htm?id=' + goods_id
        })
        # plain proxied requests fails without cookies, so the api data is
        # fetched through phantomjs instead
        # body = MyRequests.get_url_body(url=_url, headers=self.headers, params=params, encoding='gbk')
        _url = _get_url_contain_params(url=_url, params=params)  # url built from params
        # self.lg.info(_url)
        body = self.driver.use_phantomjs_to_get_url_body(url=_url)
        # self.lg.info(str(body))
        if body == '':
            self.lg.error('获取到的body为空str! 出错type:{0}, goods_id:{1}'.format(
                str(type), goods_id))
            self.result_data = {}
            return {}

        # strip the jsonp wrapper: callback( {...} )
        try:
            _ = re.compile(r'\((.*)\)').findall(body)[0]  # raw string: '\(' is an invalid escape otherwise
        except IndexError:
            _ = {}
            self.lg.error('索引异常! 出错type:{0}, goods_id:{1}'.format(
                str(type), goods_id))
        try:
            data = json.loads(_).get('rateDetail', {}).get('rateList', [])
            # pprint(data)
        except Exception:  # narrowed from a bare except
            data = []
            self.lg.error(
                'json.loads转换_出错! 出错type:{0}, goods_id:{1}'.format(
                    str(type), goods_id))
        _tmp_comment_list += data
        sleep(self.comment_page_switch_sleep_time)

    try:
        _comment_list = self._get_comment_list(
            _tmp_comment_list=_tmp_comment_list)
    except Exception as e:
        self.lg.error('出错type:{0}, goods_id:{1}'.format(
            str(type), goods_id))
        self.lg.exception(e)
        self.result_data = {}
        return {}

    _t = datetime.datetime.now()
    _r = CommentItem()
    _r['goods_id'] = str(goods_id)
    _r['create_time'] = _t
    _r['modify_time'] = _t
    _r['_comment_list'] = _comment_list
    self.result_data = _r
    # pprint(self.result_data)
    return self.result_data