示例#1
0
 def run(self):
     """Fetch a hard-coded list of WeChat article URLs, store and upload them.

     For every URL the page is downloaded, the account name and account id
     are read from the ``#js_name`` and first ``.profile_meta_value``
     elements, and an ``Article``/``JsonEntity`` pair is built. Each entity
     is saved through ``self.save_to_mongo`` and turned into a backpack
     payload plus an XML file (``self.create_xml``). After the loop the
     collected payloads are uploaded through the last entity built.

     Any exception is logged and swallowed, except that an error whose text
     contains ``'chrome not reachable'`` is re-raised.

     Raises:
         RuntimeError: if a caught exception mentions
             ``'chrome not reachable'``.
     """
     try:
         urls_article = [
             'https://mp.weixin.qq.com/s?src=11&timestamp=1541559601&ver=1229&signature=ixTsG-RvK8H58t6D-CpW6olWI8hA52Wz-FRb12ZcrNG-lxR20YutoyLYUr-RB3w8WHjE1petjDcbbxZVxTChvPWM27qszWu0Z3zonjx8SEQB5mmgm1O9Eu*5qsFhnBCH&new=1'
         ]
         entity = None
         backpack_list = []
         ftp_list = []
         ftp_info = None
         for page_count, url in enumerate(urls_article):
             # Without a timeout a stalled server would block the crawl
             # forever; a timeout error is handled by the outer ``except``.
             html = requests.get(url, timeout=30)
             # Parse the page once and reuse the document for both lookups.
             document = pq(html.text)
             # Determine the account information.
             name = document('#js_name').text()
             account_name = document('.profile_meta_value').eq(0).text()
             log('---{}---{}---'.format(name, account_name))
             account = Account()
             account.name = name
             account.account = account_name
             account.get_account_id()
             article = Article()
             try:
                 article.create(url, account)
             except RuntimeError as run_error:
                 # Best effort: the failure is only logged and processing
                 # continues with whatever state ``article`` holds.
                 log('找不到浏览器 {}'.format(run_error))
             log('第{}条 文章标题: {}'.format(page_count, article.title))
             log("当前文章url: {}".format(url))
             entity = JsonEntity(article, account)
             log('当前文章ID: {}'.format(entity.id))
             backpack = Backpack()
             backpack.create(entity)
             backpack_list.append(backpack.create_backpack())
             self.save_to_mongo(entity.to_dict())
             # FTP package: the XML file name comes from ftp_info.hash_md5.
             ftp_info = Ftp(entity)
             name_xml = ftp_info.hash_md5(ftp_info.url)
             log('当前文章xml: {}'.format(name_xml))
             self.create_xml(ftp_info.ftp_dict(), name_xml)
             ftp_list.append(name_xml)
         log("发包")
         # TODO: uploads time out; adjust the MTU.
         # ftp_info is only set after entity, so entity is not None here.
         if ftp_info is not None:
             entity.uploads_ftp(ftp_info, ftp_list)
         if entity:
             entity.uploads_datacenter_relay(backpack_list)
             entity.uploads_datacenter_unity(backpack_list)
         log("发包完成")
     except Exception as e:
         log("解析公众号错误 {}".format(e))
         # An unreachable browser is surfaced to the caller; every other
         # error is only logged.
         if 'chrome not reachable' in str(e):
             raise RuntimeError('chrome not reachable')
示例#2
0
 def run(self):
     """Crawl the configured accounts in an endless loop and upload new articles.

     Each round takes its accounts from ``ADD_COLLECTION`` when that is
     truthy, otherwise from ``self.account_list()``. For every account the
     homepage is fetched via ``self.account_homepage()``, article URLs are
     extracted with ``self.urls_article``, and each article is turned into
     a ``JsonEntity``. Entities whose id is already in ``self.dedup(...)``
     are skipped; the rest are saved through ``self.save_to_mongo`` and
     converted into a backpack payload and an XML file. The collected
     payloads are then uploaded through the last entity built.

     An exception raised while processing any account is logged and ends
     the current round (remaining accounts are not processed); the loop
     then starts the next round. The method never returns normally.

     Raises:
         RuntimeError: if a caught exception mentions
             ``'chrome not reachable'``.
     """
     count = 0
     while True:
         count += 1
         log.info('第{}次'.format(count))
         account_list = ADD_COLLECTION or self.account_list()
         try:
             for account_name in account_list:
                 log.info('第{}次'.format(count))
                 self.search_name = account_name
                 html_account = self.account_homepage()
                 if not html_account:
                     # The placeholder was missing before, so the account
                     # name never reached the log.
                     log.info('找到不到微信号首页: {}'.format(account_name))
                     continue
                 urls_article = self.urls_article(html_account)
                 # Determine the account information.
                 # presumably self.name is set by account_homepage();
                 # verify against that method.
                 account = Account()
                 account.name = self.name
                 account.account = account_name
                 account.get_account_id()
                 # Ids used to detect articles that were already handled.
                 ids = self.dedup(account_name)
                 entity = None
                 backpack_list = []
                 ftp_list = []
                 ftp_info = None
                 for page_count, url in enumerate(urls_article):
                     article = Article()
                     try:
                         article.create(url, account)
                     except RuntimeError as run_error:
                         # Best effort: the failure is only logged and
                         # processing continues with the article as is.
                         log.info('找不到浏览器 {}'.format(run_error))
                     log.info('第{}条 文章标题: {}'.format(
                         page_count, article.title))
                     log.info("当前文章url: {}".format(url))
                     entity = JsonEntity(article, account)
                     log.info('当前文章ID: {}'.format(entity.id))
                     if entity.id in ids:
                         log.info('当前文章已存在,跳过')
                         continue
                     backpack = Backpack()
                     backpack.create(entity)
                     backpack_list.append(backpack.create_backpack())
                     self.save_to_mongo(entity.to_dict())
                     # FTP package: the XML file name comes from
                     # ftp_info.hash_md5.
                     ftp_info = Ftp(entity)
                     name_xml = ftp_info.hash_md5(ftp_info.url)
                     log.info('当前文章xml: {}'.format(name_xml))
                     self.create_xml(ftp_info.ftp_dict(), name_xml)
                     ftp_list.append(name_xml)
                 log.info("发包")
                 # TODO: uploads time out; adjust the MTU.
                 if ftp_info is not None:
                     entity.uploads_ftp(ftp_info, ftp_list)
                 # NOTE(review): entity is the last one built, even when
                 # that article was skipped as a duplicate, so these calls
                 # can run with an empty backpack_list - confirm intended.
                 if entity:
                     entity.uploads_datacenter_relay(backpack_list)
                     entity.uploads_datacenter_unity(backpack_list)
                 log.info("发包完成")
         except Exception as e:
             log.exception("解析公众号错误 {}".format(e))
             # An unreachable browser is surfaced to the caller; any other
             # error only ends this round and the loop starts over.
             if 'chrome not reachable' in str(e):
                 raise RuntimeError('chrome not reachable')