# Shared imports for the scrapers below. These functions are collected from
# several modules of the same package; helpers such as get_header(),
# make_symbol(), and the constants HOME_URL / XUEQIU_HOME are defined
# elsewhere in that package.
import re
import time
import traceback
from datetime import datetime
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from requests_html import HTMLSession

import tma  # the package these scrapers belong to (provides tma.DEBUG)


def get_class_recipes(class_url, max_page=20, sleep=0.1):
    """Collect all recipe URLs under a given recipe-category URL."""
    class_url = class_url + "?page={page}"
    recipes = dict()
    # Brute-force crawl: request up to max_page pages for each category.
    for page in range(1, max_page + 1):
        time.sleep(sleep)
        url = class_url.format(page=page)
        print("current url: ", url)
        response = requests.get(url, headers=get_header())
        html = BeautifulSoup(response.text, "lxml")
        # Recipes on the current page
        menus = html.find(
            "div", {'class': 'new-menu-list search-menu-list clearfix mt10'})
        if not menus:
            # No recipe list on this page; skip it instead of crashing on None.
            print("get recipes fail: ", url)
            continue
        menus = menus.find_all('a')
        for m in menus:
            name = re.sub("\n| ", "", m.text)
            recipe_url = urljoin(HOME_URL, m['href'])
            recipes[name] = recipe_url
        # Detect the last page (kept from the original draft):
        # next_page = html.find('div', {'class': 'paging mt20'}).text
        # if "下一页" in next_page:
        #     page += 1
        # else:
        #     break
    return recipes
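# Usage sketch for get_class_recipes. The category URL below is hypothetical;
# real category URLs come from get_all_classify() further down.
#
# >>> recipes = get_class_recipes("https://www.xinshipu.com/chufang/1/", max_page=3)
# >>> for name, url in recipes.items():
# ...     print(name, url)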
def http_requests(url):
    """Fetch a URL and return the parsed BeautifulSoup document."""
    response = requests.get(url, headers=get_header())
    if response.status_code == 200:
        html = BeautifulSoup(response.text, 'lxml')
        return html
    else:
        raise ValueError("status code (%s) is not 200, "
                         "failed to fetch url" % str(response.status_code))
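# Minimal usage sketch for http_requests:
#
# >>> doc = http_requests("https://www.xinshipu.com/")
# >>> doc.title.text  # `doc` is the parsed BeautifulSoup document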
def get_plate_shares(plate_code, kind='gn'):
    """Fetch quotes for every stock in a plate from the 10jqka (同花顺) site.

    :param plate_code: str
        Plate code; codes can be looked up at http://q.10jqka.com.cn/gn/
    :param kind: str
        Plate type, one of ['gn', 'thshy']:
            gn    - concept plate
            thshy - 10jqka industry plate
    :return: pd.DataFrame
        ['序号', '代码', '名称', '现价', '涨跌幅(%)', '涨跌', '涨速(%)',
         '换手(%)', '量比', '振幅(%)', '成交额', '流通股', '流通市值', '市盈率']
    """
    url_template = "http://q.10jqka.com.cn/{kind}/detail/order/desc/" \
                   "page/{page}/ajax/1/code/{code}"
    kind_values = ['gn', 'thshy']
    if kind not in kind_values:
        raise ValueError("kind must be one of %s" % kind_values)
    i = 1
    results = []
    session = HTMLSession()
    while 1:
        url = url_template.format(kind=kind, page=i, code=plate_code)
        response = session.get(url, headers=get_header())
        response.html.render()  # execute the page's JavaScript
        html = BeautifulSoup(response.html.text, 'lxml')
        # table = html.find('table', {'class': "m-table m-pager-table"}).text.strip()
        table = html.text.strip()
        cells = table.split("\n")[:-1]
        # The last line holds the pager text ("current/total" pages).
        total_pages = int(table.split("\n")[-1].split("/")[1])
        del cells[14]
        col_nums = 14
        row_nums = int(len(cells) / col_nums)
        col_names = [x.strip() for x in cells[0:14]]
        for x in range(1, row_nums):
            results.append(cells[x * col_nums + 2:(x + 1) * col_nums - 1])
        # Exit after the last page
        if i >= total_pages:
            break
        else:
            i += 1
    return pd.DataFrame(results, columns=col_names)
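# Usage sketch for get_plate_shares. The plate code below is a placeholder;
# real codes are listed at http://q.10jqka.com.cn/gn/. Rendering the page
# requires requests_html's headless Chromium.
#
# >>> df = get_plate_shares('300023', kind='gn')   # placeholder code
# >>> df.head()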
def get_top_portfolio(market='cn', profit="monthly_gain", count=30):
    """Fetch the top live portfolios from Xueqiu.

    :param market: str, market to rank, e.g. 'cn'
    :param profit: str, ranking metric, e.g. 'monthly_gain'
    :param count: int, number of portfolios to fetch
    :return: list of dict, one dict per portfolio
    """
    base_url = "https://xueqiu.com/cubes/discover/rank/cube/list.json?" \
               "category=12&count={count}&market={market}&profit={profit}&sort=best_benefit"
    url = base_url.format(market=market, count=count, profit=profit)
    headers = get_header()
    sess = requests.session()
    # Visit the home page first to obtain cookies.
    sess.get(XUEQIU_HOME, headers=headers)
    res = sess.get(url, headers=headers).json()['list']
    top_pfs = []
    for r in res:
        pf = {
            "name": r['name'],
            "symbol": r['symbol'],
            "description": r['description'],
            "follower_count": r['follower_count'],
            "updated_at": r['updated_at'],
            "net_value": r['net_value'],
            "monthly_gain": str(r['monthly_gain']) + "%",
            "total_gain": str(r['annualized_gain_rate']) + "%",
            "last_rb_id": r['last_rb_id'],
        }
        user = {
            "id": r['owner']['id'],
            "city": r['owner']['city'],
            "description": r['owner']['description'],
            "followers_count": r['owner']['followers_count'],
            "friends_count": r['owner']['friends_count'],
            "gender": r['owner']['gender'],
            "nick_name": r['owner']['screen_name'],
            "province": r['owner']['province'],
            "status_count": r['owner']['status_count'],
        }
        pf['user'] = user
        top_pfs.append(pf)
    return top_pfs
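# Usage sketch for get_top_portfolio:
#
# >>> pfs = get_top_portfolio(market='cn', profit='monthly_gain', count=10)
# >>> pfs[0]['name'], pfs[0]['monthly_gain'], pfs[0]['user']['nick_name']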
def get_all_classify():
    """Fetch all recipe categories from xinshipu."""
    url = "https://www.xinshipu.com/%E8%8F%9C%E8%B0%B1%E5%A4%A7%E5%85%A8.html"
    response = requests.get(url, headers=get_header())
    html = BeautifulSoup(response.text, "lxml")
    all_a = html.find("div", {
        'class': "detail-cate-list clearfix mt20"
    }).find_all('a')
    classify = dict()
    for a in all_a:
        if a.has_attr('rel') and not a.has_attr('class'):
            class_url = urljoin(HOME_URL, a['href'])
            classify[a.text] = class_url
    return classify
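# The two xinshipu category helpers compose into a simple crawl pipeline
# (sketch only; crawling every category takes a long time):
#
# >>> classify = get_all_classify()
# >>> for class_name, class_url in classify.items():
# ...     recipes = get_class_recipes(class_url, max_page=2)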
def get_plate_fund_flow(kind):
    """Fetch the latest industry/concept fund flows from 10jqka (同花顺).

    :param kind: str
        Fund-flow type, one of ['hyzjl', 'gnzjl']:
            hyzjl - industry fund flow
            gnzjl - concept fund flow
    :return: pd.DataFrame
        ['序号', '行业', '行业指数', '涨跌幅', '流入资金(亿)', '流出资金(亿)',
         '净额(亿)', '公司家数', '领涨股', '涨跌幅', '当前价(元)']
    """
    if kind not in ['hyzjl', 'gnzjl']:
        raise ValueError("kind must be one of ['hyzjl', 'gnzjl']")
    url_template = "http://data.10jqka.com.cn/funds/{kind}/field/" \
                   "tradezdf/order/desc/page/{page}/ajax/1/"
    i = 1
    results = []
    session = HTMLSession()
    while 1:
        url = url_template.format(page=i, kind=kind)
        response = session.get(url, headers=get_header())
        response.html.render()  # execute the page's JavaScript
        html = BeautifulSoup(response.html.text, 'lxml')
        # table = html.find('table', {'class': "m-table J-ajax-table"}).text.strip()
        table = html.text.strip()
        cells = table.split("\n")[:-2]
        # The second-to-last line holds the pager text ("current/total" pages).
        total_pages = int(table.split("\n")[-2].split("/")[1])
        col_nums = 11
        row_nums = int(len(cells) / col_nums)
        col_names = cells[0:11]
        for x in range(1, row_nums):
            results.append(cells[x * col_nums:(x + 1) * col_nums])
        # Exit after the last page
        if i >= total_pages:
            break
        else:
            i += 1
    return pd.DataFrame(results, columns=col_names)
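# Usage sketch for get_plate_fund_flow:
#
# >>> df_hy = get_plate_fund_flow('hyzjl')   # industry fund flow
# >>> df_gn = get_plate_fund_flow('gnzjl')   # concept fund flow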
def _get_sub_comments(comment_id):
    """Fetch the sub-comments (replies) under a comment.

    :param str comment_id: comment id, e.g. `106580772`
    :return: list sub_comments
    """
    sub_com_url = "https://xueqiu.com/statuses/comments.json?id={comment_id}" \
                  "&count=20&page=1&reply=true&asc=false&type=status&split=true"
    url = sub_com_url.format(comment_id=comment_id)
    headers = get_header()
    sess = requests.Session()
    # Visit the home page first to obtain cookies.
    sess.get(XUEQIU_HOME, headers=headers, timeout=10)
    res = sess.get(url, headers=headers).json()['comments']
    sub_comments = []
    for r in res:
        com = {
            "timestamp": r['created_at'],
            "ip": r['created_ip'],
            "text": BeautifulSoup(r['text'], 'lxml').text,
            "source": r['source'],
        }
        user = {
            "id": r['user']['id'],
            "city": r['user']['city'],
            "description": r['user']['description'],
            "followers_count": r['user']['followers_count'],
            "friends_count": r['user']['friends_count'],
            "gender": r['user']['gender'],
            "nick_name": r['user']['screen_name'],
            "province": r['user']['province'],
            "status_count": r['user']['status_count'],
        }
        com["user"] = user
        sub_comments.append(com)
    return sub_comments
def zao_pan():
    """Fetch the 10jqka (同花顺) pre-market briefing."""
    url = "http://stock.10jqka.com.cn/zaopan/"
    response = requests.get(url, headers=get_header())
    html = BeautifulSoup(response.text, 'lxml')
    # Quote of the day
    wisdom = html.find('div', {'class': "select-content"}).text.strip()
    # Yesterday's close
    yesterday = html.find('div', {'class': 'yestoday'}).text.strip()
    yesterday = yesterday.replace("  ", "|")
    # Today's focus
    content = html.find('div', {'class': "content-main-fl fl"}).text.strip()
    content = re.sub('[ \u3000]', "\n", content)
    res = [wisdom, yesterday, content]
    return "\n\n".join(res)
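# Usage sketch for zao_pan:
#
# >>> print(zao_pan())   # quote / yesterday's close / today's focus, blank-line separated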
def get_article_list(self, d=None):
    """Fetch the headline article list from the home page."""
    html = requests.get(self.home_url, headers=get_header())
    bsobj = BeautifulSoup(html.content.decode('utf-8'), 'lxml')
    a_list = []
    for a in bsobj.find_all("a"):
        try:
            url = a['href']
            title = a.text.strip()
            date_ = self._get_date_from_url(url)
            a_list.append([url, title, date_])
        except Exception:
            if tma.DEBUG:
                traceback.print_exc()
            continue
    a_list = [
        a for a in a_list
        if a[0] != ""
        and a[0].strip("/") != "http://xhgy.xinhuanet.com"
        and a[0].startswith("http")
        and a[1] != ""
        and a[1] != "视频MP4地址"
        and "c_" in a[0]
        and a[2] != ""
        # and 'photo' not in a[0]
        # and 'video' not in a[0]
    ]
    # Deduplicate by url
    df = pd.DataFrame(a_list, columns=['url', 'title', 'date'])
    df.drop_duplicates('url', inplace=True)
    res = [list(x) for x in list(df.values)]
    if d is None:
        date_list = [datetime.now().date().__str__()]
    else:
        date_list = d
    res = [a for a in res if a[2] in date_list]
    res = sorted(res, key=lambda x: x[2], reverse=True)
    return res
def get_article_detail(article_url):
    """Fetch the article content behind a Xinhuanet article_url.

    :param article_url: article url
    :return: {
        "url": article_url,
        "title": title,
        "pub_time": pub_time,
        "source": source,
        "content": content
    }
    """
    # article_url = "http://www.xinhuanet.com/fortune/2018-06/20/c_129897476.htm"
    html = requests.get(article_url, headers=get_header())
    bsobj = BeautifulSoup(html.content.decode('utf-8'), 'lxml')
    # Parse title, publish time, and source
    cols = bsobj.find('div', {"class": "h-news"}).text.strip().split("\r\n")
    title = cols[0].strip()
    pub_time = cols[1].strip()
    source = cols[-1].strip()
    # Parse the body text
    content = bsobj.find('div', {"id": "p-detail"}).text.strip()
    content = content.replace("\u3000\u3000", "")
    content = [x.strip() for x in content.split("\n")]
    content = [x for x in content if x != ""]
    content = "\n".join(content)
    return {
        "url": article_url,
        "title": title,
        "pub_time": pub_time,
        "source": source,
        "content": content
    }
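# Usage sketch for get_article_detail, reusing the sample URL from the
# commented-out line inside the function:
#
# >>> article = get_article_detail("http://www.xinhuanet.com/fortune/2018-06/20/c_129897476.htm")
# >>> article['title'], article['pub_time'], article['source']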
def get_comments(code, sleep=1):
    """Fetch the Xueqiu comments for stock `code`.

    :param str code: stock code, e.g. `600122`
    :param float sleep: seconds to sleep between pages, default 1
    :return: dict with keys 'symbol', 'count', 'comment_list'
    """
    headers = get_header()
    sess = requests.Session()
    # Visit the home page first to obtain cookies.
    sess.get(XUEQIU_HOME, headers=headers, timeout=10)
    # Fetch the first page of comments
    symbol = make_symbol(code)
    real_time = str(time.time()).replace('.', '')[0:-1]  # current timestamp
    comment_url = 'https://xueqiu.com/statuses/search.json?' \
                  'count=10&comment=0&symbol={symbol}&hl=0&' \
                  'source=user&sort=time&page={page}&_={real_time}'
    url = comment_url.format(symbol=symbol, page=1, real_time=real_time)
    res = sess.get(url, headers=headers, timeout=10).json()
    count = res['count']  # total number of comments
    total_page = res['maxPage']  # number of pages
    print("total pages:", total_page)
    # Container for the collected comments
    comments = {
        "symbol": symbol,
        "count": count,
    }
    coms = []
    for i in range(1, total_page + 1):
        print(i)
        time.sleep(sleep)
        headers = get_header()
        sess_temp = requests.Session()
        # Visit the home page first to obtain cookies.
        sess_temp.get(XUEQIU_HOME, headers=headers, timeout=10)
        real_time = str(time.time()).replace('.', '')[0:-1]  # current timestamp
        url = comment_url.format(symbol=symbol, page=i, real_time=real_time)
        try:
            res = sess_temp.get(url, headers=headers, timeout=10).json()['list']
        except Exception:
            print(i, "fail")
            traceback.print_exc()
            continue
        for r in res:
            com = {
                "text": BeautifulSoup(r['text'], 'lxml').text,
                "id": r['id'],
                "time": r['timeBefore'],
                "reply_count": int(r['reply_count']),
                "source": r['source']
            }
            user = {
                "id": r['user']['id'],
                "city": r['user']['city'],
                "description": r['user']['description'],
                "followers_count": r['user']['followers_count'],
                "friends_count": r['user']['friends_count'],
                "gender": r['user']['gender'],
                "nick_name": r['user']['screen_name'],
                "province": r['user']['province'],
                "status_count": r['user']['status_count'],
            }
            com['user'] = user
            if com['reply_count'] > 0:
                com['sub_comments'] = _get_sub_comments(com['id'])
            else:
                com['sub_comments'] = []
            coms.append(com)
    comments['comment_list'] = coms
    return comments
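# Usage sketch for get_comments, reusing the stock code from its docstring;
# a larger sleep is gentler on the site:
#
# >>> comments = get_comments('600122', sleep=2)
# >>> comments['count'], len(comments['comment_list'])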
def get_recipe_detail(recipe_url):
    """Fetch the recipe details behind a recipe url.

    :param recipe_url: str
        Recipe url, e.g.: https://www.xinshipu.com/zuofa/598775;
        https://www.xinshipu.com//zuofa/749342
    :return: dict
    """
    response = requests.get(recipe_url, headers=get_header())
    html = BeautifulSoup(response.text, 'lxml')
    # Recipe name
    name = html.find("div", {"class": "re-up"}).h1.text
    # Main image
    img = html.find("div", {"class": "gallery"}).a['href']
    img = urljoin(HOME_URL, img)
    all_info = html.find_all("div", {"class": "dd"})
    if len(all_info) == 4:
        # Introduction
        intro = re.sub('\n|\t|\r| ', '', all_info[0].text)
        material_i = 1
        method_i = 2
    else:
        intro = None
        material_i = 0
        method_i = 1
    # Ingredients
    material = all_info[material_i].text.strip()
    material = re.sub('\r\n|\r\n \n|\n\n\n', '\n', material)
    # Cooking steps
    try:
        method_steps = html.find("ol", {
            "class": "re-step-wpic"
        }).find_all('li')
        method = []
        for i, m in enumerate(method_steps, 1):
            step = dict(step_num=i)
            step['text'] = m.text.strip()
            if m.img:
                step['img_url'] = urljoin(HOME_URL, m.img['src'])
            method.append(step)
    except Exception:
        # No illustrated step list; fall back to the plain-text block.
        method = all_info[method_i].text.strip()
        method = re.sub('\r\n|\r\n \n|\n\n\n\n', '\n', method)
    # Related categories
    classify = all_info[-1].text.strip()
    if '\xa0\xa0' in classify:
        classify = classify.replace('\xa0\xa0', ' | ')
    else:
        classify = ""
    return {
        "name": name,
        "url": recipe_url,
        "img": img,
        "intro": intro,
        "material": material,
        "method": method,
        "classify": classify
    }
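# Usage sketch for get_recipe_detail, reusing the sample URL from its docstring:
#
# >>> detail = get_recipe_detail("https://www.xinshipu.com/zuofa/598775")
# >>> detail['name'], detail['classify']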
def get_website_map():
    """Fetch the Xinhuanet site map (unfinished)."""
    wzdt_url = "http://www.xinhuanet.com/wzdt2014.htm"
    html = requests.get(wzdt_url, headers=get_header())
    bsobj = BeautifulSoup(html.content.decode('utf-8'), 'lxml')
    map_raw = bsobj.find('div', {'class': "content_left"})
    raise NotImplementedError