示例#1
0
    def getArticleList(self, num=None):
        """
        Collect article records for the account named by ``self.ori_nickname``.

        :param num: when truthy, only the ``num`` most recently published
            articles (``p_date`` descending) are read; otherwise everything
            returned by ``col.get()`` is used.
        :return: list with one dict per article holding exactly the keys
            iterated from ``l1llll1lll_wcplus_``; a key that is missing from
            the stored document is filled with ``-2``.
        """
        from pymongo import DESCENDING
        col = DB(self.ori_nickname)
        if num:
            # The sorted cursor must be sliced directly: a pymongo cursor is
            # not callable, so the former ``...sort(...)()`` raised TypeError.
            article_datas = col.table.find().sort('p_date', DESCENDING)[:num]
        else:
            article_datas = col.get()
        begin_time = time.time()
        article_list = []
        for doc in article_datas:
            # Store the article URL under 'id' before the keys are copied.
            doc['id'] = doc['content_url']
            article_list.append({
                key: (doc[key] if key in doc else -2)
                for key in l1llll1lll_wcplus_
            })

        logger.info('解析文章文本用时 %.3f' % (time.time() - begin_time))
        return article_list
示例#2
0
 def prepare_docs(self, num=None):
     """
     Collect article records for the account named by ``self.nickname_raw``.

     :param num: when truthy, only the ``num`` most recently published
         articles (``p_date`` descending) are read; otherwise everything
         returned by ``col.get()`` is used.
     :return: list with one dict per article holding exactly the keys
         iterated from ``doc_schema``; a key that is missing from the stored
         document is filled with ``-2``.
     """
     from pymongo import DESCENDING
     # Fetch the article documents from the database.
     col = CollectionOperation(self.nickname_raw)
     if num:
         # The sorted cursor must be sliced directly: a pymongo cursor is
         # not callable, so the former ``...sort(...)()`` raised TypeError.
         db_docs = col.table.find().sort("p_date", DESCENDING)[:num]
     else:
         db_docs = col.get()
     begin_time = time.time()
     # Build the result list from the keys listed in doc_schema.
     doc_list = []
     for doc in db_docs:
         # Store the article URL under 'id' before the keys are copied.
         doc['id'] = doc['content_url']
         # Fields absent from the database document are filled with -2.
         doc_list.append({
             key: (doc[key] if key in doc else -2)
             for key in doc_schema
         })
     logger.info('解析文章文本用时 %.3f'%(time.time()-begin_time))
     return doc_list
示例#3
0
 def l1l11l111_wcplus_(cls, q=True):
     """
     Validate the stored certificate and report when it expires.

     :param q: quiet flag; when falsy the outcome is also written to the log.
     :return: ``False`` when both stored values are falsy, when
         ``cls.l11l111l1_wcplus_()`` returns ``1``, when ``int(mac)`` is not
         contained in the value returned by ``l1lll1l1lll_wcplus_()``, when
         ``l1lll1l11ll_wcplus_()`` returns a falsy value, or when the
         computed end time is not later than that value. Otherwise the end
         time formatted as ``'%Y-%m-%d %H:%M:%S'``.
     """
     mac, pt = cls.l1lll1ll1l1_wcplus_()
     if not mac and not pt:
         return False
     local_id = cls.l11l111l1_wcplus_()
     allowed_ids = l1lll1l1lll_wcplus_()
     if local_id == 1:
         return False
     if int(mac) not in allowed_ids:
         if not q:
             logger.warning('证书错误')
         return False
     end_time = int(pt) - int(mac) - 12874767561234
     current_time = l1lll1l11ll_wcplus_()
     if not current_time:
         return False
     # Reuse the value fetched above. Calling the time source a second time
     # repeated the lookup and its result was used without being checked.
     remaining = end_time - current_time
     if remaining <= 0:
         if not q:
             logger.warning('证书过期')
         return False
     end_time_text = datetime.utcfromtimestamp(end_time).strftime(
         '%Y-%m-%d %H:%M:%S')
     if not q:
         logger.info('证书有效至' + end_time_text)
     return end_time_text
示例#4
0
 def response(self, l1lll1111l1_wcplus_):
     """
     Inspect one proxied flow and store the request data of watched URLs.

     For every key of ``l1lll111111_wcplus_`` whose value occurs in the
     request URL, the formatted request data is stored under
     ``'<id>.<key>.req'``. The id is the module-level
     ``l11l11llll_wcplus_``, which is refreshed from 'home' requests.
     Processing stops as soon as that id is ``'UNK'``.
     For 'getappmsgext' the response JSON is read and, when it carries a
     ``nick_name`` other than ``'UNK'``, the id is stored under
     ``'<nick_name>.nick_name'``. For 'home' the nickname embedded in the
     response HTML is stored under ``'current_nickname'``.

     :param l1lll1111l1_wcplus_: flow object exposing ``request`` (with a
         ``url``) and ``response``.
     """
     global l11l11llll_wcplus_
     for key in l1lll111111_wcplus_:
         if l1lll111111_wcplus_[key] not in l1lll1111l1_wcplus_.request.url:
             # The response body is only meaningful for a matching URL.
             # Previously the handlers below ran for every flow, parsing
             # unrelated responses as JSON/HTML.
             continue
         req_data, timestamp = l1lll111lll_wcplus_.l1lll1111ll_wcplus_(
             l1lll1111l1_wcplus_.request)
         if key == 'home':
             l11l11llll_wcplus_ = self._1ll1llll1l_wcplus_(req_data)
         # Without a known id the request data is not stored.
         if l11l11llll_wcplus_ == 'UNK':
             return
         key_name = '%s.%s.req' % (l11l11llll_wcplus_, key)
         l1lll11ll11_wcplus_(key_name, req_data)
         logger.debug(key_name)
         if key == 'getappmsgext':
             status_code, text = l1lll111lll_wcplus_.get_response(
                 l1lll1111l1_wcplus_.response)
             payload = json.loads(text)
             if 'nick_name' in payload:
                 nick_name = payload['nick_name']
                 if nick_name == 'UNK':
                     logger.debug('没能找到微信昵称 换一篇文章点击试试看 确保文章底部阅读数据出现')
                 else:
                     l1lll11ll11_wcplus_(nick_name + '.nick_name',
                                         l11l11llll_wcplus_)
         elif key == 'home':
             status_code, html_text = l1lll111lll_wcplus_.get_response(
                 l1lll1111l1_wcplus_.response)
             current_nickname = html_text.split(
                 'var nickname = "')[1].split('" || ""')[0]
             logger.info('准备公众号:' + current_nickname)
             l1lll11ll11_wcplus_('current_nickname', current_nickname)
示例#5
0
 def examplePassport(cls, q=True):
     """
     Validate the stored certificate and report when it expires.

     :param q: quiet flag; when falsy the outcome is also written to the log.
     :return: ``False`` when both values from ``cls.getMacUUid()`` are
         falsy, when ``cls.getUUid()`` returns ``1``, when ``int(mac)`` is
         not contained in ``getUuidIp()``, when ``getBaiduTime()`` returns a
         falsy value, or when the computed end time is not later than that
         time. Otherwise the end time formatted as ``'%Y-%m-%d %H:%M:%S'``.
     """
     mac, pt = cls.getMacUUid()
     if not mac and not pt:
         return False
     uuid_ = cls.getUUid()
     mac_uuid = getUuidIp()
     if uuid_ == 1:
         return False
     if int(mac) not in mac_uuid:
         if not q:
             logger.warning('证书错误')
         return False
     end_time = int(pt) - int(mac) - 12874767561234
     baidu_time = getBaiduTime()
     if not baidu_time:
         return False
     # Reuse the value fetched above. Calling getBaiduTime() a second time
     # repeated the lookup and its result was used without being checked.
     time_left = end_time - baidu_time
     if time_left <= 0:
         if not q:
             logger.warning('证书过期')
         return False
     end_time_text = datetime.utcfromtimestamp(end_time).strftime(
         '%Y-%m-%d %H:%M:%S')
     if not q:
         logger.info('证书有效至' + end_time_text)
     return end_time_text
示例#6
0
 def response(self, flow):
     """
     Inspect one proxied flow and store the request data of watched URLs.

     For every key of ``url_filter`` whose value occurs in the request URL,
     the formatted request data is stored under ``'<wxuin>.<key>.req'``.
     Processing stops as soon as the extracted wxuin is ``'UNK'``.
     For 'getappmsgext' the response JSON is read and, when it carries a
     ``nick_name`` other than ``'UNK'``, the wxuin is stored under
     ``'<nick_name>.nick_name'``. For 'home' the nickname embedded in the
     response HTML is stored under ``'current_nickname'``.

     :param flow: flow object exposing ``request`` (with a ``url``) and
         ``response``.
     """
     for key in url_filter:
         if url_filter[key] not in flow.request.url:
             # The response body is only meaningful for a matching URL.
             # Previously the handlers below ran for every flow, parsing
             # unrelated responses and reading ``wxuin`` before it was bound.
             continue
         req_data, timestamp = ExtractFlow.format_request_data(
             flow.request)
         wxuin = self._extract_wxuin(req_data)
         # Without a known wxuin the request data is not stored.
         if wxuin == 'UNK':
             return
         key_name = '%s.%s.req' % (wxuin, key)
         insert_helper(key_name, req_data)
         logger.debug(key_name)
         if key == 'getappmsgext':
             status_code, text = ExtractFlow.get_response(flow.response)
             text_dict = json.loads(text)
             if 'nick_name' in text_dict:
                 nick_name = text_dict['nick_name']
                 if nick_name == 'UNK':
                     logger.debug('没能找到微信昵称 换一篇文章点击试试看 确保文章底部阅读数据出现')
                 else:
                     insert_helper(nick_name + '.nick_name', wxuin)
         elif key == 'home':
             status_code, html_text = ExtractFlow.get_response(
                 flow.response)
             current_nickname = html_text.split(
                 'var nickname = "')[1].split('" || ""')[0]
             logger.info('准备公众号:' + current_nickname)
             insert_helper('current_nickname', current_nickname)
示例#7
0
 def check_password(cls, q=True):
     """
     Report the certificate expiry date as a string.

     NOTE(review): the first statement returns a fixed date unconditionally,
     so none of the validation written below it is ever executed. Confirm
     whether that short-circuit is intended.

     The unreachable validation reads ``mac`` and ``pt`` from
     ``cls.read_password()``, derives the end time as
     ``int(pt) - int(mac) - 18058584888`` and compares it with
     ``get_internet_time()``.

     :param q: quiet flag; in the unreachable part a falsy value makes the
         outcome get logged as well.
     :return: currently always ``'2099-12-31 00:00:00'``. The unreachable
         part would return ``False`` for an invalid certificate and the end
         date formatted as ``'%Y-%m-%d %H:%M:%S'`` for a valid one.
     """
     return '2099-12-31 00:00:00'
     # Everything from here on is unreachable because of the return above.
     mac, pt = cls.read_password()
     if not (mac or pt):
         return False
     your_mac = cls.get_mac_address()
     your_macs, _ = get_uuind()
     if your_mac == 1:
         return False
     if int(mac) not in your_macs:
         if not q:
             logger.warning('证书错误')
         return False
     end_time = int(pt) - int(mac) - 18058584888
     if not get_internet_time():
         return False
     if end_time - get_internet_time() <= 0:
         if not q:
             logger.warning('证书过期')
         return False
     end_time_str = datetime.utcfromtimestamp(end_time).strftime(
         '%Y-%m-%d %H:%M:%S')
     if not q:
         logger.info('证书有效至' + end_time_str)
     return end_time_str
示例#8
0
    def l1llll1l1l_wcplus_(self, num=None):
        """
        Collect article records for the account named by
        ``self.l1lll1lll1_wcplus_``.

        :param num: when truthy, only the ``num`` most recently published
            articles (``p_date`` descending) are read; otherwise everything
            returned by ``col.get()`` is used.
        :return: list with one dict per article holding exactly the keys
            iterated from ``l1llll1lll_wcplus_``; a key that is missing from
            the stored document is filled with ``-2``.
        """
        from pymongo import DESCENDING
        col = l1l11llll_wcplus_(self.l1lll1lll1_wcplus_)
        if num:
            # The sorted cursor must be sliced directly: a pymongo cursor is
            # not callable, so the former ``...sort(...)()`` raised TypeError.
            stored_docs = col.table.find().sort('p_date', DESCENDING)[:num]
        else:
            stored_docs = col.get()
        begin_time = time.time()
        records = []
        for doc in stored_docs:
            # Store the article URL under 'id' before the keys are copied.
            doc['id'] = doc['content_url']
            records.append({
                key: (doc[key] if key in doc else -2)
                for key in l1llll1lll_wcplus_
            })

        logger.info('解析文章文本用时 %.3f' % (time.time() - begin_time))
        return records
示例#9
0
    def response(self, flow):
        """
        Inspect one proxied flow and store the request data of watched URLs.

        For every key of ``url_filter`` whose value occurs in the request
        URL, the formatted request data is stored under
        ``'<wxuin>.<key>.req'``. The module-level ``wxuin`` is refreshed
        only from 'home' requests, and processing stops as soon as it is
        ``'UNK'``. For 'getappmsgext' the nickname found in the response
        JSON is linked to the wxuin; for 'home' the nickname embedded in the
        response HTML is stored under ``'current_nickname'``.

        :param flow: flow object exposing ``request`` (with a ``url``) and
            ``response``.
        """
        global wxuin
        for key in url_filter:
            # Skip filter entries that do not match this request.
            if url_filter[key] not in flow.request.url:
                continue

            # Convert the request into its parameters and a timestamp.
            req_data, timestamp = ExtractFlow.format_request_data(flow.request)
            if key == 'home':
                wxuin = self._extract_wxuin(req_data)
            # Parameters are not saved while the wxuin is unknown.
            if wxuin == 'UNK':
                return

            key_name = '%s.%s.req' % (wxuin, key)
            insert_helper(key_name, req_data)
            logger.debug(key_name)

            if key == 'getappmsgext':
                # Link the WeChat nickname to the wxuin so that several
                # WeChat accounts can be collected at the same time.
                status_code, text = ExtractFlow.get_response(flow.response)
                text_dict = json.loads(text)
                if 'nick_name' not in text_dict:
                    continue
                nick_name = text_dict['nick_name']
                if nick_name != 'UNK':
                    insert_helper(nick_name + '.nick_name', wxuin)
                else:
                    logger.debug('没能找到微信昵称 换一篇文章点击试试看 确保文章底部阅读数据出现')
            elif key == 'home':
                status_code, html_text = ExtractFlow.get_response(flow.response)
                after_marker = html_text.split('var nickname = "')[1]
                current_nickname = after_marker.split('" || ""')[0]
                logger.info('准备公众号:' + current_nickname)
                insert_helper('current_nickname', current_nickname)
示例#10
0
 def run(self, command, cmd_args):
     """
     Convert string ``mov`` values to integers for every finished account.

     For each entry of ``Finished().get()['finished']`` the account's
     collection is scanned; documents whose ``mov`` is a ``str`` get it
     replaced by ``int(mov)`` and are written back with
     ``col.insert('id', ...)``. The number of converted documents is logged
     and passed to ``command_log``.

     :param command: not used by this method.
     :param cmd_args: not used by this method.
     """
     from app.api.gzh import Finished
     from cmp.db.mongo import CollectionOperation
     for gzh in Finished().get()['finished']:
         nickname = gzh['nickname']
         col = CollectionOperation(nickname)
         converted = []
         for article in col.get():
             mov = article['mov']
             # Only documents that still hold a string are rewritten.
             if type(mov) is not str:
                 continue
             article['mov'] = int(mov)
             converted.append(article)
         col.insert('id', converted)
         message = '转化完成 %d %s' % (len(converted), nickname)
         logger.info(message)
         command_log(message)
示例#11
0
 def print_ts_brief():
     """
     Log a short progress line and forward it to ``front_process``.

     The task-status record is taken with ``TaskRecoder.get_ts()`` and
     handed straight back with ``TaskRecoder.put_ts``. Its speed and
     done/total counters are logged and, when ``front_process`` is set,
     passed on through ``front_process.new_article`` together with
     ``ts['ips']``.
     """
     global front_process
     ts = TaskRecoder.get_ts()
     TaskRecoder.put_ts(ts)
     done = ts['done_task_num']
     total = ts['total_task_num']
     speed = ts['speed']
     logger.info('速度%.3f 完成%d/%d' % (speed, done, total))
     # Skip the front-end update when no front-end object is available
     # instead of failing with an AttributeError.
     if front_process:
         front_process.new_article(done, total, ts['ips'], speed)
示例#12
0
def l1l1ll1lll_wcplus_(l1l1l1l1l1_wcplus_=200):
    """
    Flush the module-level buffer ``l1l1l1111l_wcplus_`` once it is full.

    Nothing happens while the buffer holds fewer than
    ``l1l1l1l1l1_wcplus_`` entries. Otherwise the buffer is handed to
    ``l11ll111l_wcplus_.insert`` and replaced by an empty list; a failed
    insert is retried every 3 seconds until it succeeds.

    :param l1l1l1l1l1_wcplus_: buffer length that triggers a flush.
    :return: None
    """
    global l1l1l1111l_wcplus_
    if len(l1l1l1111l_wcplus_) < l1l1l1l1l1_wcplus_:
        return
    while True:
        try:
            # NOTE(review): `id` is not bound in this block, so unless the
            # module rebinds it this passes the builtin function; confirm
            # whether the string 'id' was intended.
            l11ll111l_wcplus_.insert(id, l1l1l1111l_wcplus_)
        except Exception as exc:
            # Retry through the loop only. The former recursive retry made
            # the outer loop insert the already-emptied buffer again, and a
            # bare ``except`` also swallowed KeyboardInterrupt.
            logger.warning('保存失败 3秒后重试: %s' % exc)
            time.sleep(3)
            continue
        saved = len(l1l1l1111l_wcplus_)
        l1l1l1111l_wcplus_ = []
        # Report what was actually written, which may exceed the threshold.
        logger.info('保存成功保存%d' % saved)
        return
示例#13
0
 def l1l1l11lll_wcplus_():
     """
     Log a short progress line and forward it to the front-end object.

     The task-status record is taken with
     ``l1l1ll1l11_wcplus_.l1ll11ll1l_wcplus_()`` and handed straight back
     with ``l1l1ll1l11_wcplus_.l1l11l1ll1_wcplus_``. Its speed and
     done/total counters are logged and, when ``l1ll1l11ll_wcplus_`` is
     truthy, passed on through its ``l11l1l1ll_wcplus_`` method together
     with ``ts['ips']``.
     """
     global l1ll1l11ll_wcplus_
     ts = l1l1ll1l11_wcplus_.l1ll11ll1l_wcplus_()
     l1l1ll1l11_wcplus_.l1l11l1ll1_wcplus_(ts)
     done = ts['done_task_num']
     total = ts['total_task_num']
     speed = ts['speed']
     # The former ``item`` dict was built here but never read, so it is gone.
     logger.info('速度%.3f 完成%d/%d' % (speed, done, total))
     if l1ll1l11ll_wcplus_:
         l1ll1l11ll_wcplus_.l11l1l1ll_wcplus_(done, total, ts['ips'], speed)
示例#14
0
def save_article(windos=200):
    """
    Flush the module-level ``article_data_buffer`` once it is full.

    Nothing happens while the buffer holds fewer than ``windos`` entries.
    Otherwise the buffer is handed to ``col_data.insert`` and replaced by
    an empty list; a failed insert is retried every 3 seconds until it
    succeeds.

    :param windos: buffer length that triggers a flush.
    :return: None
    """
    global article_data_buffer
    if len(article_data_buffer) < windos:
        return
    while True:
        try:
            # NOTE(review): `id` is not bound in this block, so unless the
            # module rebinds it this passes the builtin function; confirm
            # whether the string 'id' was intended.
            col_data.insert(id, article_data_buffer)
        except Exception as exc:
            # Retry through the loop only. The former recursive retry made
            # the outer loop insert the already-emptied buffer again, and a
            # bare ``except`` also swallowed KeyboardInterrupt.
            logger.warning('保存失败 3秒后重试: %s' % exc)
            time.sleep(3)
            continue
        saved = len(article_data_buffer)
        article_data_buffer = []
        # Report what was actually written, which may exceed the threshold.
        logger.info('保存成功保存%d' % saved)
        return
示例#15
0
 def _insert(use_data, item, p_date):
     """
     Add one article-list entry to ``use_data``.

     :param use_data: dict with an ``'index'`` counter and a ``'data'``
         list; the counter is always incremented, the entry is appended
         only when it has a truthy title.
     :param item: source entry from which the fields in ``keys`` are taken
         via ``DecodeArticleList.sub_dict``.
     :param p_date: publish time as a timestamp accepted by
         ``datetime.fromtimestamp``.
     """
     use_data['index'] += 1
     keys = ('title', 'author', 'content_url', 'digest', 'cover',
             'source_url', 'mov', 'nickname')
     sub_data = DecodeArticleList.sub_dict(item, keys)
     sub_data['p_date'] = datetime.fromtimestamp(p_date)
     # The id is the md5 of the article URL.
     sub_data['id'] = get_md5(sub_data['content_url'])
     if sub_data['title']:
         use_data['data'].append(sub_data)
     # A summary dict used to be assembled here but was never read; its
     # ``title[:5]`` slice could also raise for a non-string title, so it
     # has been removed.
     logger.info('采集文章列表中... %2d %2s %s' %
                 (use_data['index'], sub_data['mov'], sub_data['title']))
示例#16
0
 def _insert(l11lll1lll_wcplus_, item, l1l1111111_wcplus_):
     """
     Add one article-list entry to ``l11lll1lll_wcplus_``.

     :param l11lll1lll_wcplus_: dict with an ``'index'`` counter and a
         ``'data'`` list; the counter is always incremented, the entry is
         appended only when it has a truthy title.
     :param item: source entry from which the fields in ``keys`` are taken
         via ``l11lll11l1_wcplus_.l11llll111_wcplus_``.
     :param l1l1111111_wcplus_: publish time as a timestamp accepted by
         ``datetime.fromtimestamp``.
     """
     l11lll1lll_wcplus_['index'] += 1
     keys = ('title', 'author', 'content_url', 'digest', 'cover',
             'source_url', 'mov', 'nickname')
     sub_data = l11lll11l1_wcplus_.l11llll111_wcplus_(item, keys)
     sub_data['p_date'] = datetime.fromtimestamp(l1l1111111_wcplus_)
     # The id is derived from the article URL by l11llll11_wcplus_.
     sub_data['id'] = l11llll11_wcplus_(sub_data['content_url'])
     sub_data['mov'] = int(sub_data['mov'])
     if sub_data['title']:
         l11lll1lll_wcplus_['data'].append(sub_data)
     # A summary dict used to be assembled here but was never read; its
     # ``title[:5]`` slice could also raise for a non-string title, so it
     # has been removed.
     logger.info('采集文章列表中... %2d %2s %s' %
                 (l11lll1lll_wcplus_['index'], sub_data['mov'],
                  sub_data['title']))
示例#17
0
 def _insert(use_data, item, p_date):
     '''
     Add one article-list entry to ``use_data``.

     :param use_data: dict with an ``'index'`` counter and a ``'data'``
         list; the counter is always incremented, the entry is appended
         only when it has a truthy title.
     :param item: source entry from which the fields in ``keys`` are taken
         via ``DecodeArticleList.sub_dict``.
     :param p_date: publish time as a timestamp accepted by
         ``datetime.fromtimestamp``.
     '''
     use_data['index'] += 1
     keys = ('title', 'author', 'content_url', 'digest', 'cover', 'source_url','mov','nickname')
     sub_data = DecodeArticleList.sub_dict(item, keys)
     sub_data["p_date"] = datetime.fromtimestamp(p_date)
     # The id is the md5 of the article URL, which keeps it usable across
     # different database operations.
     sub_data["id"] = get_md5(sub_data['content_url'])
     # mov is stored as an integer.
     sub_data['mov'] = int(sub_data['mov'])
     # Only articles that still have a title are kept; an article without
     # a title has been deleted.
     if sub_data["title"]:
         use_data['data'].append(sub_data)
     # A task-log dict used to be assembled here but was never read; its
     # ``title[:5]`` slice could also raise for a non-string title, so it
     # has been removed.
     logger.info('采集文章列表中... %2d %2s %s'%(use_data["index"],sub_data["mov"], sub_data["title"]))
示例#18
0
 def l11l1llll1_wcplus_(rd):
     """
     Log the reading statistics held in ``rd``.

     :param rd: mapping providing ``read_num``, ``like_num``,
         ``reward_num`` and ``comment_num``.
     """
     fields = ('read_num', 'like_num', 'reward_num', 'comment_num')
     counters = tuple(rd[name] for name in fields)
     logger.info('采集阅读数据中... 阅读%-5d 点赞%-4d 赞赏%3d 评论%d' % counters)
示例#19
0
 def l1l11ll11l_wcplus_():
     """
     Log the current task-status record.

     The record is taken with ``l1l1ll1l11_wcplus_.l1ll11ll1l_wcplus_()``
     and handed straight back with ``l1l11l1ll1_wcplus_`` before it is
     logged.
     """
     status = l1l1ll1l11_wcplus_.l1ll11ll1l_wcplus_()
     l1l1ll1l11_wcplus_.l1l11l1ll1_wcplus_(status)
     logger.info(status)
示例#20
0
 def print_ts():
     """
     Log the current task-status record.

     The record is taken with ``TaskRecoder.get_ts()`` and handed straight
     back with ``TaskRecoder.put_ts`` before it is logged.
     """
     status = TaskRecoder.get_ts()
     TaskRecoder.put_ts(status)
     logger.info(status)
示例#21
0
 def print_log(rd):
     """
     Log the reading statistics held in ``rd``.

     :param rd: mapping providing ``read_num``, ``like_num``,
         ``reward_num`` and ``comment_num``.
     """
     fields = ('read_num', 'like_num', 'reward_num', 'comment_num')
     counters = tuple(rd[name] for name in fields)
     logger.info('采集阅读数据中... 阅读%-5d 点赞%-4d 赞赏%3d 评论%d' % counters)