def _craw(self, url, param=None, *args):
    res = requests.get(url, params=param, headers=headers)
    # param["page"] = param["page"] + 1
    if res.status_code == 200:
        # JSON response body
        body_json = res.json()
        if body_json:
            res_list = []
            for arti in body_json:
                arti = arti['object']['data']
                data = third_post_db.find_by_pt_id(
                    "jianshu-" + str(arti['id']), self.third_id)
                if data is None:
                    # build the post: id, title, tags, author, like count,
                    # comment count, redirect url, creation time
                    post = ThirdPost(self.third_id, self.third_name, 0)
                    post.tags = ''
                    post.post_id = "jianshu-" + str(arti['id'])
                    post.title = arti['title']
                    post.author = arti['user']['nickname']
                    post.content = arti['public_abbr']
                    post.like_num = arti['likes_count']
                    post.comment_num = arti['public_comments_count']
                    post.redirect_url = 'https://www.jianshu.com/p/' + arti["slug"]
                    post.creatime = arrow.get(
                        arti['first_shared_at']).format('YYYY-MM-DD HH:mm:ss')
                    res_list.append(post)
            log.info("[%s] crawled %s: %d new records", self.third_name, url, len(res_list))
            self.batch_insert(res_list)
def _craw(self, url, param=None, *args):
    res = requests.post(url, param)
    if res.status_code == 200:
        # HTML document
        html = res.text
        soup = BeautifulSoup(html, 'html.parser')
        # all post items on the page
        posts = soup.find_all("div", class_="post_item")
        res_list = []
        for post in posts:
            p = ThirdPost(self.third_id, self.third_name, 0)
            post_a = post.find("a", class_="titlelnk")
            # redirect url
            p.redirect_url = post_a['href']
            # post id, taken from the /p/<id>.html part of the url
            p.post_id = re.findall(r"/p/(.+?)\.html", p.redirect_url)[0]
            # title
            p.title = post_a.string
            # creation time
            p.creatime = post_a.next_sibling.string
            data = third_post_db.find_by_pt_id(p.post_id, p.third_id)
            if data is None:
                res_list.append(p)
        log.info("[%s] crawled %s: %d new records", self.third_name, url, len(res_list))
        self.batch_insert(res_list)
def _craw(self, url, param=None, *args):
    res = requests.post(url, json.dumps(param), headers=headers)
    if res.status_code == 200:
        # JSON response body
        body_json = res.json()
        print(body_json)
        article_list = body_json['data']
        res_list = []
        for post in article_list:
            p = ThirdPost(self.third_id, self.third_name, 0)
            p.title = post['article_title']
            # tags come from the topic list
            tags = []
            for t in post['topic']:
                tags.append(t['name'])
            p.tags = ",".join(tags)
            p.post_id = "infoq-" + post['uuid']
            # fall back to the platform name when no author is given
            if 'author' in post.keys():
                p.author = post['author'][0]['nickname']
            else:
                p.author = "InfoQ"
            p.content = post['article_summary']
            p.redirect_url = "https://www.infoq.cn/article/" + post['uuid']
            # utime is a millisecond timestamp
            p.creatime = arrow.get(post['utime'] / 1000).format('YYYY-MM-DD HH:mm:ss')
            data = third_post_db.find_by_pt_id(p.post_id, p.third_id)
            if data is None:
                res_list.append(p)
        log.info("[%s] crawled %s: %d new records", self.third_name, url, len(res_list))
        self.batch_insert(res_list)
def _craw(self, url, param=None, *args):
    res = requests.post(url)
    if res.status_code == 200:
        # HTML document
        html = res.text
        soup = BeautifulSoup(html, 'html.parser')
        res_list = []
        # all posts under the archive container
        archive = soup.find("div", id="archive")
        posts = archive.find_all("div", class_="post")
        for post in posts:
            p = ThirdPost(self.third_id, self.third_name, 0)
            post_meta = post.find("div", class_="post-meta")
            post_a = post_meta.find("a", "meta-title")
            # redirect url
            p.redirect_url = post_a['href']
            # post id, taken from the <slug>.html part of the url
            p.post_id = "importnew-" + re.findall(r"m/(.+?)\.html", p.redirect_url)[0]
            # title
            p.title = post_a.string
            # default to the platform name as author
            p.author = self.third_name
            # creation time: the text before the "|" separator
            p.creatime = post_a.next_sibling.next_sibling.split("|")[0].strip()
            # summary
            p.content = post.find("span", class_="excerpt").p.string
            if p.content is None:
                p.content = ""
            data = third_post_db.find_by_pt_id(p.post_id, p.third_id)
            if data is None:
                res_list.append(p)
        log.info("[%s] crawled %s: %d new records", self.third_name, url, len(res_list))
        self.batch_insert(res_list)
def _craw(self, url, param=None, *args):
    res = requests.get(url)
    if res.status_code == 200:
        # HTML document
        html = res.text
        soup = BeautifulSoup(html, "html.parser")
        # all article items on the page
        posts = soup.find_all("div", class_="list_article_item")
        res_list = []
        for post in posts:
            p = ThirdPost(self.third_id, self.third_name, 0)
            tip_spans = post.find("div", class_="tip").find_all("span")
            # post id
            p.post_id = post['data-id']
            # title
            p.title = post.find("div", class_="title").a.string
            # redirect url
            p.redirect_url = host + post.find("div", class_="title").a['href']
            # creation time: prepend the current year to the on-page date
            now_year = datetime.datetime.now().year
            p.creatime = str(now_year) + "-" + list(tip_spans)[2].string.strip()
            # author
            p.author = list(tip_spans)[0].string.strip()
            # tags are passed in by the caller
            p.tags = args[0]
            data = third_post_db.find_by_pt_id(p.post_id, p.third_id)
            if data is None:
                res_list.append(p)
        log.info("[%s] crawled %s: %d new records", self.third_name, url, len(res_list))
        self.batch_insert(res_list)
def _craw(self, url, param=None, *args):
    res = requests.get(url, param)
    if res.status_code == 200:
        # HTML document
        html = res.text
        soup = BeautifulSoup(html, 'html.parser')
        res_list = []
        # all article items on the page
        posts = soup.find_all("div", class_="article-lwrap")
        for post in posts:
            p = ThirdPost(self.third_id, self.third_name, 0)
            post_a = post.find("a", "title")
            # title
            p.title = post_a.p.string
            # redirect url
            p.redirect_url = host + post_a['href']
            # author
            p.author = post.find("a", class_='nickName').string.strip()
            # creation time defaults to the crawl time
            p.creatime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            # post id
            p.post_id = "/imooc" + post_a['href']
            # tags
            p_skills = post.find("span", class_="skill")
            p_tags = p_skills.find_all("a")
            tags = []
            for tag in p_tags:
                tags.append(tag.span.string)
            p.tags = ",".join(tags)
            data = third_post_db.find_by_pt_id(p.post_id, p.third_id)
            if data is None:
                res_list.append(p)
        log.info("[%s] crawled %s: %d new records", self.third_name, url, len(res_list))
        self.batch_insert(res_list)
def _craw(self, url, param=None, *args):
    res = requests.post(url, json.dumps(param), headers=header)
    if res.status_code == 200:
        # minimum number of likes an article must have to be kept
        like_total = args[0]
        # Juejin JSON response
        body_json = res.json()
        print(body_json)
        if body_json['data'] is None:
            log.error("failed to crawl Juejin: %s", body_json['errors'])
            return
        article_list = body_json['data']['articleFeed']['items']['edges']
        res_list = []
        for artiCol in article_list:
            arti = artiCol['node']
            data = third_post_db.find_by_pt_id(arti['id'], self.third_id)
            # only keep new articles with more than like_total likes
            if data is None and arti['likeCount'] > like_total:
                # build the post: id, title, tags, author, like count,
                # comment count, redirect url, creation time
                post = ThirdPost(self.third_id, self.third_name, 0)
                tags = []
                for t in arti['tags']:
                    tags.append(t['title'])
                post.tags = ",".join(tags)
                post.post_id = arti['id']
                post.title = arti['title']
                post.author = arti['user']['username']
                post.content = arti['content']
                post.like_num = arti['likeCount']
                post.comment_num = arti['commentsCount']
                post.redirect_url = arti['originalUrl']
                post.creatime = arrow.get(arti['createdAt']).format('YYYY-MM-DD HH:mm:ss')
                res_list.append(post)
        log.info("[%s] crawled %s: %d new records", self.third_name, url, len(res_list))
        self.batch_insert(res_list)
def _craw(self, url, param=None, *args):
    res = requests.get(url)
    if res.status_code == 200:
        # HTML document
        htmls = res.text
        soup = BeautifulSoup(htmls, 'html.parser')
        news_list = soup.find("div", class_="news-list")
        # all news items on the page
        posts = news_list.find_all("div", class_="news-item")
        res_list = []
        for post in posts:
            p = ThirdPost(self.third_id, self.third_name, 0)
            post_title = post.find("h4", "news__item-title")
            p.title = post_title.text
            post_href = post.find("a", target="_blank")
            p.redirect_url = host + post_href['href']
            post_author = post.find("span", class_="author")
            p.author = post_author.a.text
            # skip posts published by the SegmentFault account itself
            if p.author == "SegmentFault":
                continue
            # creation time defaults to the crawl time
            p.creatime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            # post id
            p.post_id = "segment-" + post_href['href']
            # summary
            p.content = post.find("div", class_="article-excerpt").text.strip()
            data = third_post_db.find_by_pt_id(p.post_id, p.third_id)
            if data is None:
                res_list.append(p)
        log.info("[%s] crawled %s: %d new records", self.third_name, url, len(res_list))
        self.batch_insert(res_list)
def _craw(self, url, param=None, *args):
    res = requests.get(url)
    if res.status_code == 200:
        # HTML document
        htmls = res.text
        soup = BeautifulSoup(htmls, 'html.parser')
        # work out how many pages the column has (20 articles per page)
        total_box = soup.find("div", class_="statistics_t")
        total = int(total_box.find("span").text)
        total_page = int(math.ceil(total / 20))
        # crawl page by page
        index = 1
        while total_page >= 1 and index <= total_page:
            article_list = []
            param = {"page": index}
            res = requests.get(url, param)
            if res.status_code == 200:
                # collect the article urls on this page
                htmls = res.text
                soup = BeautifulSoup(htmls, 'html.parser')
                detail_list = soup.find("ul", class_="detail_list").find_all("li")
                for detail in detail_list:
                    href = detail.find("a")["href"]
                    article_list.append(href)
            res_list = []
            log.info("this column has %s articles", len(article_list))
            for i, article in enumerate(article_list):
                log.info("%s --> %d", article, i + 1)
                res = requests.get(article)
                if res.status_code == 200:
                    htmls = res.text
                    soup = BeautifulSoup(htmls, 'html.parser')
                    p = ThirdPost(self.third_id, self.third_name, self.can_analysis)
                    p.redirect_url = article
                    p.post_id = re.findall(r"/article/(.+)", p.redirect_url)[0]
                    p.title = soup.find("h1", class_="title-article").text
                    p.author = soup.find("a", id="uid").text
                    # article body
                    a = soup.find("article")
                    # drop the "read more" and copyright blocks
                    ar = a.find("div", class_="article-copyright")
                    hide = a.find("div", class_="hide-article-box")
                    if hide is not None:
                        hide.replace_with("")
                    if ar is not None:
                        ar.replace_with("")
                    # rewrite image urls through a proxy to work around hotlink protection
                    imgList = a.find_all("img")
                    for img in imgList:
                        url_str = "https://www.chaoyer.com/api/file/proxy?proxy=https://blog.csdn.net&img=" + str(img["src"])
                        img["src"] = url_str
                    p.content = html.escape(str(a))
                    p.creatime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                    data = third_post_db.find_by_pt_id(p.post_id, p.third_id)
                    if data is None:
                        res_list.append(p)
            log.info("[%s] crawled %s: %d new records", self.third_name, url, len(res_list))
            self.batch_insert(res_list)
            index = index + 1
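# ---------------------------------------------------------------------------
# Context note: each _craw method above lives in its own spider class and
# relies on module-level names that are not shown in these snippets (requests,
# BeautifulSoup, re, json, math, arrow, datetime, html, headers/header, host,
# log, third_post_db, ThirdPost, and the inherited batch_insert). The sketch
# below is a minimal, hypothetical reconstruction of that shared scaffolding
# so a single snippet can be read or exercised in isolation; the names,
# fields, and signatures are assumptions, not the project's actual definitions.
import logging

log = logging.getLogger(__name__)

# assumed request headers; the real project defines its own
headers = {"User-Agent": "Mozilla/5.0"}
header = headers
# assumed site root used to build absolute redirect urls
host = "https://example.com"


class ThirdPost:
    """Container for one crawled article, with the fields the crawlers above set."""

    def __init__(self, third_id, third_name, can_analysis):
        self.third_id = third_id
        self.third_name = third_name
        self.can_analysis = can_analysis
        self.post_id = ""
        self.title = ""
        self.tags = ""
        self.author = ""
        self.content = ""
        self.like_num = 0
        self.comment_num = 0
        self.redirect_url = ""
        self.creatime = ""


class _ThirdPostDbStub:
    """Stub of the dedup lookup; the real third_post_db queries the database."""

    def find_by_pt_id(self, post_id, third_id):
        return None  # pretend nothing has been crawled yet


third_post_db = _ThirdPostDbStub()


class BaseSpider:
    """Assumed base class: subclasses implement _craw and reuse batch_insert."""

    def __init__(self, third_id, third_name, can_analysis=0):
        self.third_id = third_id
        self.third_name = third_name
        self.can_analysis = can_analysis

    def batch_insert(self, res_list):
        # the real implementation persists to third_post_db; this stub only logs
        log.info("would insert %d posts", len(res_list))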