Example 1
    def _craw(self, url, param=None, *args):
        # while True:
        res = requests.get(url, params=param, headers=headers)
        # param["page"] = param["page"] + 1
        if res.status_code == 200:
            # response
            body_json = res.json()
            if body_json:
                res_list = []
                for arti in body_json:
                    arti = arti['object']['data']
                    data = third_post_db.find_by_pt_id(
                        "jianshu-" + str(arti['id']), self.third_id)
                    if data is None:
                        # build the post record
                        post = ThirdPost(self.third_id, self.third_name, 0)
                        post.tags = ''
                        # order: post id, title, tags, author, like count,
                        # comment count, redirect url, creation time
                        post.post_id = "jianshu-" + str(arti['id'])
                        post.title = arti['title']
                        post.author = arti['user']['nickname']
                        post.content = arti['public_abbr']
                        post.like_num = arti['likes_count']
                        post.comment_num = arti['public_comments_count']
                        post.redirect_url = 'https://www.jianshu.com/p/' + \
                            arti["slug"]
                        post.creatime = arrow.get(
                            arti['first_shared_at']).format(
                                'YYYY-MM-DD HH:mm:ss')
                        res_list.append(post)
                log.info("[%s] crawled -> %s  %d records", self.third_name, url,
                         len(res_list))
                self.batch_insert(res_list)
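
The commented-out lines above hint at a paginated variant of this crawler. The following is only a minimal sketch of that loop, assuming the endpoint accepts a page query parameter and returns an empty JSON list once all pages are exhausted (both assumptions, as is the _handle_page helper):

    def _craw_paged(self, url, param=None, *args):
        # hypothetical paginated variant of Example 1
        param = dict(param or {"page": 1})
        while True:
            res = requests.get(url, params=param, headers=headers)
            if res.status_code != 200:
                break
            body_json = res.json()
            if not body_json:
                # assumption: an empty list means there are no more pages
                break
            self._handle_page(body_json)  # hypothetical helper: parse + batch_insert
            param["page"] = param["page"] + 1
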
Example 2
    def _craw(self, url, param=None, *args):
        res = requests.get(url, params=param, headers=headers)
        if res.status_code == 200:

            # HTML document
            html = res.text
            soup = BeautifulSoup(html, "html.parser")
            # all article blocks on the page
            posts = soup.find_all("div", class_="post")

            res_list = []
            # parse each article
            for post in posts:
                p = ThirdPost(self.third_id, self.third_name, 0)
                post_content = post.find("div", class_="content")
                meta = post_content.find("div", class_="meta")
                if param is None:
                    user_info = post.find("div", class_="user-info").find(
                        "div", class_="info")
                    # author
                    p.author = user_info.h4.text
                    # post id
                    p.post_id = post_content.p.a["href"]
                    # content (summary text)
                    p.content = post_content.p.a.string
                else:
                    # author
                    author_wrap = soup.find("div", class_="m-b").h3.text
                    p.author = author_wrap
                    # post id
                    p.post_id = post_content["data-url"]
                    p.content = ""
                # title
                p.title = post_content.h3.a.string
                # like count
                p.like_num = post.find(
                    "a", class_="like-button").find("span").string
                # comment count
                p.comment_num = list(meta.find("span").stripped_strings)[0]
                # redirect URL
                p.redirect_url = host + post_content.h3.a['href']

                data = third_post_db.find_by_pt_id(p.post_id, p.third_id)
                if data is None:
                    res_list.append(p)
            log.info("[%s]爬取-> %s  %d条记录", self.third_name, url, len(res_list))
            self.batch_insert(res_list)
Example 3
    def _craw(self, url, param=None, *args):
        res = requests.post(url, data=json.dumps(param), headers=header)
        if res.status_code == 200:
            like_total = args[0]  # minimum number of likes required
            # juejin response
            body_json = res.json()
            print(body_json)
            if body_json['data'] is None:
                log.error("failed to crawl Juejin: %s", body_json['errors'])
                return
            article_list = body_json['data']['articleFeed']['items']['edges']

            res_list = []
            for artiCol in article_list:

                arti = artiCol['node']

                data = third_post_db.find_by_pt_id(
                    arti['id'], self.third_id)

                if data is None and arti['likeCount'] > like_total:  # keep only posts above the like threshold
                    # build the post record
                    post = ThirdPost(self.third_id, self.third_name, 0)
                    tags = []
                    for t in arti['tags']:
                        tags.append(t['title'])
                    post.tags = ",".join(tags)
                    # order: post id, title, tags, author, like count,
                    # comment count, redirect url, creation time
                    post.post_id = arti['id']
                    post.title = arti['title']
                    post.author = arti['user']['username']
                    post.content = arti['content']
                    post.like_num = arti['likeCount']
                    post.comment_num = arti['commentsCount']
                    post.redirect_url = arti['originalUrl']
                    post.creatime = arrow.get(
                        arti['createdAt']).format('YYYY-MM-DD HH:mm:ss')

                    res_list.append(post)
            log.info("[%s] crawled -> %s  %d records", self.third_name, url, len(res_list))
            self.batch_insert(res_list)
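
All three snippets lean on the same surrounding pieces: a ThirdPost model, a third_post_db lookup used for deduplication, a batch_insert method, and module-level request headers. Their real definitions are not shown here; the following is only a rough stand-in inferred from how they are called above:

    class ThirdPost:
        # assumed minimal model; field names taken from the assignments above
        def __init__(self, third_id, third_name, status):
            self.third_id = third_id
            self.third_name = third_name
            self.status = status
            self.post_id = ''
            self.title = ''
            self.tags = ''
            self.author = ''
            self.content = ''
            self.like_num = 0
            self.comment_num = 0
            self.redirect_url = ''
            self.creatime = None

    # third_post_db.find_by_pt_id(post_id, third_id) is assumed to return the
    # stored record or None (which is how the snippets skip already-crawled
    # posts), and self.batch_insert(res_list) is assumed to persist the new
    # records in bulk.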