Example #1
 def __init__(self, nickname):
     """
     :param nickname: name of the official account
     """
     self.nickname = nickname
     # Materialize the generator into a list and store it in self.posts
     self.posts = []
     self.total_num = 0
     self.crawled_num = 0
     self.uncrawled_num = 0
     for p in get_collection_article(nickname):
         # Skip articles whose read statistics have not been crawled yet
         if 'read_num' in p:
             self.posts.append(p)
             self.crawled_num += 1
         else:
             self.uncrawled_num += 1
         self.total_num += 1
     # Build the ECharts data only when at least one article has read statistics
     if self.crawled_num != 0:
         self.gzh = GZH(self.posts)
         self.gzh.postsToDataframe()
         # option_data passed to ECharts
         self.option_data = {}
         # Official account name
         self.option_data['account_name'] = nickname
         # Total number of published articles
         self.option_data['posts_info'] = ' %d valid articles, %d with read statistics, %d still without read statistics' % \
                                          (self.total_num, self.crawled_num, self.uncrawled_num)
Example #2
def get_all_articles_data(nickname):
    """
    :param nickname: nickname of the official account
    :return: the full article list of one official account, in the form
    {'title': 'account name', 'articles': [{}, {}, {}]}
    """
    use_keys = [
        'article_id', 'p_date', 'read_num', 'like_num', 'reward_num',
        'comment_num', 'author', 'mov', 'title', 'content_url'
    ]
    data = {}
    data['title'] = nickname
    data['articles'] = []
    articles = get_collection_article(nickname)
    id_counter = 0
    for article in articles:
        if 'title' not in article:
            continue
        id_counter += 1
        use_data = {}
        # use_data = dict((k, article[k]) for k in use_keys)
        for k in use_keys:
            if k in article:
                use_data[k] = article[k]
            else:
                use_data[k] = '-'
        # Convert the publication date into a display string
        use_data['p_date'] = use_data['p_date'].strftime("%Y/%m/%d")
        use_data['article_id'] = id_counter
        data['articles'].append(use_data)
    return data
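A minimal usage sketch for Example #2, assuming get_all_articles_data is importable and MongoDB already holds crawled articles for the account; the account name and output file name below are placeholders:

import json

data = get_all_articles_data('some_account')
print('fetched %d articles' % len(data['articles']))

# Dump the result, e.g. for a frontend table (illustrative file name)
with open('articles.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=2)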
Example #3
 def __init__(self, *args, **kwargs):
     """
     :param args:
     :param kwargs:
     Initialization called when the crawler is instantiated
     """
     # Generator yielding every article of the current account that has no body text yet
     self.current_nickname = TidyReqData.get_nickname()
     self.articles_list = get_collection_article(self.current_nickname,
                                                 article={"$exists": False},
                                                 title={"$exists": True})
     self.crawler_begin_time = time()
     self.crawler_parse_counter = 0
Example #4
 def __init__(self, *args, **kwargs):
     """
     :param args:
     :param kwargs:
     Initialization called when the crawler is instantiated
     """
     # Generator yielding every article of the current account that has no read statistics yet
     self.current_nickname = TidyReqData.get_nickname()
     print(self.current_nickname)
     articles_list = get_collection_article(self.current_nickname,
                                            read_num={"$exists": False},
                                            comment_id={"$exists": True})
     self.articles_list = []
     for article in articles_list:
         self.articles_list.append(article)
     self.task_num = len(self.articles_list)
     self.task_counter = 0
     self.begin_time = time()
     self.pre_time = time()
Example #5
 def index_db_docs(self, nickname):
     """
     :param nickname: nickname of the official account
     :return: fetches all of the account's articles from MongoDB and indexes them into ES with a bulk operation
     """
     # Create the index first
     index_name = self.create_index(nickname)
     # Fetch all of the account's articles from the database
     articles = get_collection_article(nickname,
                                       article={"$exists": True},
                                       title={"$exists": True})
     articles_cache = []
     # The MongoDB connection expires after 10 minutes, so indexing may not finish in time; cache the account's full article history first
     for article in articles:
         doc = dict((key, article[key]) for key in doc_schema)
         articles_cache.append(doc)
     # Index the documents with a bulk operation
     result = self.index_bulk(index_name, articles_cache)
     return result
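All five examples call get_collection_article(nickname, **kwargs). Its implementation is not shown here; judging from the call sites, it selects the account's MongoDB collection by nickname and turns the keyword arguments into a query filter (e.g. article={"$exists": False}), yielding article documents one by one. A minimal sketch of such a helper using pymongo, with the connection URI and database name as assumptions:

from pymongo import MongoClient

# Hypothetical connection details; the real project keeps these elsewhere
client = MongoClient('mongodb://localhost:27017')
db = client['wechat']

def get_collection_article(nickname, **kwargs):
    """Yield article documents of one account, filtered by the given field conditions."""
    # Each keyword argument becomes a field condition in the query,
    # e.g. read_num={"$exists": False} -> {'read_num': {'$exists': False}}
    for doc in db[nickname].find(dict(kwargs)):
        yield doc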