示例#1
0
 def run(self):
     """Crawl a hard-coded list of WeChat (mp.weixin.qq.com) article URLs.

     For each URL: fetch the page, scrape the account name/alias from the
     profile header, parse the article, save it to Mongo, and build both a
     datacenter packet and an FTP XML packet.  All packets are uploaded in
     one batch after the loop finishes.

     Raises:
         RuntimeError: re-raised when the underlying error says Chrome is
             unreachable, so the caller can restart the browser.
     """
     try:
         urls_article = [
             'https://mp.weixin.qq.com/s?src=11&timestamp=1541559601&ver=1229&signature=ixTsG-RvK8H58t6D-CpW6olWI8hA52Wz-FRb12ZcrNG-lxR20YutoyLYUr-RB3w8WHjE1petjDcbbxZVxTChvPWM27qszWu0Z3zonjx8SEQB5mmgm1O9Eu*5qsFhnBCH&new=1'
         ]
         entity = None
         backpack_list = []
         ftp_list = []
         ftp_info = None
         for page_count, url in enumerate(urls_article):
             # if page_count < 15:
             #     continue
             html = requests.get(url)
             # Determine the account info by scraping the profile header.
             name = pq(html.text)('#js_name').text()
             account_name = pq(
                 html.text)('.profile_meta_value').eq(0).text()
             log('---{}---{}---'.format(name, account_name))
             account = Account()
             account.name = name
             account.account = account_name
             account.get_account_id()
             article = Article()
             try:
                 article.create(url, account)
             except RuntimeError as run_error:
                 # NOTE(review): on failure the loop keeps going with a
                 # partially-initialised Article -- confirm this is intended.
                 log('找不到浏览器 {}'.format(run_error))
             log('第{}条 文章标题: {}'.format(page_count, article.title))
             log("当前文章url: {}".format(url))
             entity = JsonEntity(article, account)
             log('当前文章ID: {}'.format(entity.id))
             # if entity.id in ids:
             #     log('当前文章已存在,跳过')
             #     continue
             backpack = Backpack()
             backpack.create(entity)
             backpack_list.append(backpack.create_backpack())
             # self.save_to_mysql(entity)
             self.save_to_mongo(entity.to_dict())
             # FTP packet: one XML file per article, named by the URL's MD5.
             ftp_info = Ftp(entity)
             name_xml = ftp_info.hash_md5(ftp_info.url)
             log('当前文章xml: {}'.format(name_xml))
             self.create_xml(ftp_info.ftp_dict(), name_xml)
             ftp_list.append(name_xml)
             # if page_count >= 3:
             #     break
         log("发包")
         # TODO: packet sending can time out; consider adjusting the MTU.
         if ftp_info is not None:
             entity.uploads_ftp(ftp_info, ftp_list)
         if entity:
             # entity.uploads(backpack_list)
             entity.uploads_datacenter_relay(backpack_list)
             entity.uploads_datacenter_unity(backpack_list)
         log("发包完成")
     except Exception as e:
         log("解析公众号错误 {}".format(e))
         if 'chrome not reachable' in str(e):
             raise RuntimeError('chrome not reachable')
示例#2
0
 def run(self):
     """Daily collection loop: scrape each scheduled account's articles and upload.

     Loops forever unless ADD_COLLECTION is set (a supplementary pass over a
     fixed account list), in which case one pass is performed and the loop
     breaks.  Per account: locate the homepage, enumerate article URLs,
     optionally deduplicate against previously collected ids, then build and
     upload datacenter and FTP packets.

     Raises:
         RuntimeError: re-raised when the underlying error says Chrome is
             unreachable, so the caller can restart the browser.
     """
     count = 0
     while True:
         # ADD_COLLECTION = supplementary account list; get_account() = daily
         # schedule.  account_list handles both a single account and a list.
         account_list = ADD_COLLECTION if ADD_COLLECTION else self.get_account(
         )
         # length = len(threading.enumerate())  # enumerate() returns a list
         # log.info('当前运行的线程数为:{}'.format(threading.active_count()))
         count += 1
         log.info('第{}次'.format(count))
         if account_list is None:
             log.info('调度队列为空,休眠5秒')
             time.sleep(5)
             continue
         for account_name in account_list:
             try:
                 self.search_name = account_name
                 html_account = self.account_homepage()
                 if html_account:
                     html = html_account
                 else:
                     log.info('{}|找到不到微信号'.format(account_name))
                     continue
                 urls_article = self.urls_article(html)
                 # Build the account entity from the scraped homepage.
                 account = Account()
                 account.name = self.name
                 account.account = account_name
                 account.tags = self.get_tags()
                 account.get_account_id()
                 # Dedup: ids of already-collected articles; '' effectively
                 # disables the check when JUDEG is falsy.
                 ids = self.dedup(account_name) if JUDEG else ''
                 entity = None
                 backpack_list = []
                 ftp_list = []
                 ftp_info = None
                 for page_count, url in enumerate(urls_article):
                     # if page_count < 15:
                     #     continue
                     article = Article()
                     try:
                         article.create(url, account)
                     except RuntimeError as run_error:
                         # NOTE(review): the loop continues with a
                         # partially-initialised Article -- confirm intended.
                         log.info('微信验证码错误 {}'.format(run_error))
                     log.info('第{}条 文章标题: {}'.format(
                         page_count, article.title))
                     log.info("当前文章url: {}".format(url))
                     entity = JsonEntity(article, account)
                     log.info('当前文章ID: {}'.format(entity.id))
                     if entity.id in ids and JUDEG is True:
                         log.info('当前文章已存在,跳过')
                         # if page_count >= 20:
                         #     log.info('超过20篇文章,跳出')
                         #     break
                         continue
                     backpack = Backpack()
                     backpack.create(entity)
                     backpack_list.append(backpack.create_backpack())
                     # self.save_to_mysql(entity)
                     # self.save_to_mongo(entity.to_dict())
                     # FTP packet: one XML file per article, named by URL MD5.
                     ftp_info = Ftp(entity)
                     name_xml = ftp_info.hash_md5(ftp_info.url)
                     log.info('当前文章xml: {}'.format(name_xml))
                     self.create_xml(ftp_info.ftp_dict(), name_xml)
                     ftp_list.append(name_xml)
                     # break
                 log.info("开始发包")
                 # TODO: packet sending can time out; consider adjusting the MTU.
                 if ftp_info is not None:
                     entity.uploads_ftp(ftp_info, ftp_list)
                     log.info("ftp发包完成")
                 if entity and backpack_list:
                     # entity.uploads(backpack_list)
                     entity.uploads_datacenter_relay(backpack_list)
                     entity.uploads_datacenter_unity(backpack_list)
                     log.info("数据中心,三合一,发包完成")
             except Exception as e:
                 log.exception("解析公众号错误 {}".format(e))
                 if 'chrome not reachable' in str(e):
                     raise RuntimeError('chrome not reachable')
         if ADD_COLLECTION:
             break
示例#3
0
    def run(self):
        """Collection loop with Redis-backed deduplication and a 15-day window.

        Loops forever unless ADD_COLLECTION is set (one supplementary pass).
        Per account: locate the homepage, enumerate article URLs, skip
        articles older than 15 days or already present in the dedup result,
        build datacenter/FTP packets, upload them, and finally report the
        newly collected article ids back to the dedup center over HTTP.

        Raises:
            RuntimeError: when the underlying error says Chrome is
                unreachable or a Selenium timeout occurred, so a supervisor
                can restart the browser.
        """
        count = 0
        while True:
            # ADD_COLLECTION = supplementary account list; get_account() =
            # daily schedule.  account_list handles both a single account
            # and a list of accounts.
            account_list = ADD_COLLECTION if ADD_COLLECTION else self.get_account(
            )
            # length = len(threading.enumerate())  # enumerate() returns a list
            log.info('当前运行的线程数为:{}'.format(threading.active_count()))
            log.info('当前运行的进程:{}'.format(
                multiprocessing.current_process().name))
            count += 1
            log.info('第{}次'.format(count))
            if account_list is None:
                log.info('调度队列为空,休眠5秒')
                time.sleep(5)
                continue
            for account_name in account_list:
                try:
                    self.search_name = account_name
                    html_account = self.account_homepage()
                    if html_account:
                        html = html_account
                    else:
                        log.info('{}|找到不到微信号'.format(account_name))
                        continue
                    urls_article = self.urls_article(html)
                    # Build the account entity from the scraped homepage.
                    account = Account()
                    account.name = self.name
                    account.account = account_name
                    account.tags = self.get_tags()
                    account.get_account_id()
                    # Dedup against the backing store (disabled):
                    # ids = self.dedup(account_name) if JUDEG else ''
                    # Dedup via Redis: key is md5("<account> <account_id>").
                    sentenced_keys = account.account + ' ' + str(
                        account.account_id)
                    keys = hash_md5(sentenced_keys)
                    log.info('keys: {}'.format(keys))
                    dedup_result = self.dedup_redis(keys)
                    # Article ids collected in this pass; reported to the
                    # dedup center after a successful upload.
                    post_dedup_urls = []

                    entity = None
                    backpack_list = []
                    ftp_list = []
                    ftp_info = None
                    for page_count, url in enumerate(urls_article):
                        try:
                            # if page_count > 5:
                            #     break
                            article = Article()
                            article.create(url, account, self.proxies)
                            log.info('第{}条 文章标题: {}'.format(
                                page_count, article.title))
                            log.info("当前文章url: {}".format(url))
                            entity = JsonEntity(article, account)
                            log.info('当前文章ID: {}'.format(entity.id))
                            # article.time is presumably a millisecond
                            # timestamp (last 3 digits stripped) -- TODO confirm.
                            article_date = datetime.datetime.fromtimestamp(
                                int(str(article.time)[:-3]))
                            day_diff = datetime.date.today(
                            ) - article_date.date()
                            if day_diff.days > 15:
                                # Articles older than the 15-day collection
                                # window stop the scan for this account.
                                log.info(
                                    '超过采集interval最大15天 的文章不采集,已采集{}条文章'.format(
                                        page_count))
                                self.count_articles(page_count)
                                break
                            if dedup_result:
                                # title_time_str = entity.title + str(entity.time)
                                # title_time_md5 = hash_md5(title_time_str)
                                if entity.id in dedup_result:
                                    log.info('当前文章已存在,跳过')
                                    continue
                                else:
                                    post_dedup_urls.append(entity.id)
                            else:
                                # title_time_str = entity.title + str(entity.time)
                                # title_time_md5 = hash_md5(title_time_str)
                                post_dedup_urls.append(entity.id)

                            # dedup_result = self.dedup_redis(entity)
                            # if dedup_result:
                            #     log.info('当前文章已存在,跳过')
                            # ids = ids.append({'key': entity.id, 'urls': entity.url})
                            # if entity.id in ids and JUDEG is True:
                            #     log.info('当前文章已存在,跳过')
                            #     continue
                            backpack = Backpack()
                            backpack.create(entity)
                            backpack_list.append(backpack.create_backpack())
                            # self.save_to_mysql(entity)
                            # self.save_to_mongo(entity.to_dict())
                            # FTP packet: one XML per article, named by URL MD5.
                            ftp_info = Ftp(entity)
                            name_xml = ftp_info.hash_md5(ftp_info.url)
                            log.info('当前文章xml: {}'.format(name_xml))
                            self.create_xml(ftp_info.ftp_dict(), name_xml)
                            ftp_list.append(name_xml)
                        except Exception as run_error:
                            # Best effort per article: log and move on.
                            log.info('微信解析文章错误 {}'.format(run_error))
                            continue

                    log.info("开始发包")
                    if entity and backpack_list:
                        # Send directly to the backing store (disabled):
                        # entity.uploads(backpack_list)
                        entity.uploads_datacenter_relay(backpack_list)
                        entity.uploads_datacenter_unity(backpack_list)
                        log.info("数据中心,三合一,发包完成")
                    else:
                        log.info('包列表为空,不发送数据')
                        continue
                    # TODO: packet sending can time out; consider adjusting the MTU.
                    if ftp_info is not None:
                        entity.uploads_ftp(ftp_info, ftp_list)
                        log.info("ftp发包完成")
                    if post_dedup_urls:
                        log.info('上传判重中心key:{} urls:{}'.format(
                            keys, post_dedup_urls))
                        # NOTE(review): hard-coded dedup-center endpoint;
                        # consider moving it to configuration.
                        url = 'http://47.100.53.87:8008/Schedule/CacheWx'
                        data = [{
                            "key": keys,
                            "sourceNodes": "1",
                            "sourceType": "2",
                            "urls": post_dedup_urls
                        }]
                        r = requests.post(url,
                                          data=json.dumps(data),
                                          timeout=self.timeout)
                        log.info('上传判重中心结果{}'.format(r.status_code))
                except Exception as e:
                    log.exception("解析公众号错误 {}".format(e))
                    time.sleep(30)
                    if ('chrome not reachable'
                            in str(e)) or ('Message: timeout' in str(e)):
                        raise RuntimeError('chrome not reachable')
            if ADD_COLLECTION:
                break
示例#4
0
 def run(self):
     """Main crawl loop: iterate accounts, parse their articles, and upload packets.

     Runs forever, one pass per ``while`` iteration: resolves the account
     list (ADD_COLLECTION overrides the scheduled list), scrapes each
     account's homepage for article URLs, parses every article into a
     datacenter packet plus an FTP XML packet, persists each article to
     Mongo, and uploads everything after the per-account loop.

     Raises:
         RuntimeError: when the underlying error says Chrome is unreachable,
             so a supervisor can restart the browser.
     """
     count = 0
     while True:
         count += 1
         log.info('第{}次'.format(count))
         # ADD_COLLECTION is a manual re-collection list; otherwise pull the
         # scheduled accounts.  Both yield an iterable of account names.
         account_list = ADD_COLLECTION if ADD_COLLECTION else self.account_list(
         )
         try:
             for account_name in account_list:
                 log.info('第{}次'.format(count))
                 self.search_name = account_name
                 html_account = self.account_homepage()
                 if html_account:
                     html = html_account
                 else:
                     # BUG FIX: original format string had no {} placeholder,
                     # so the account name was silently dropped from the log.
                     log.info('找到不到微信号首页: {}'.format(account_name))
                     continue
                 urls_article = self.urls_article(html)
                 # Build the account entity from the scraped homepage.
                 account = Account()
                 account.name = self.name
                 account.account = account_name
                 account.get_account_id()
                 # Ids of already-collected articles, used for deduplication.
                 ids = self.dedup(account_name)
                 entity = None
                 backpack_list = []
                 ftp_list = []
                 ftp_info = None
                 for page_count, url in enumerate(urls_article):
                     article = Article()
                     try:
                         article.create(url, account)
                     except RuntimeError as run_error:
                         # Browser unavailable; the article stays partially
                         # initialised and is still logged/processed below.
                         log.info('找不到浏览器 {}'.format(run_error))
                     log.info('第{}条 文章标题: {}'.format(
                         page_count, article.title))
                     log.info("当前文章url: {}".format(url))
                     entity = JsonEntity(article, account)
                     log.info('当前文章ID: {}'.format(entity.id))
                     if entity.id in ids:
                         log.info('当前文章已存在,跳过')
                         continue
                     backpack = Backpack()
                     backpack.create(entity)
                     backpack_list.append(backpack.create_backpack())
                     self.save_to_mongo(entity.to_dict())
                     # FTP packet: one XML file per article, named by URL MD5.
                     ftp_info = Ftp(entity)
                     name_xml = ftp_info.hash_md5(ftp_info.url)
                     log.info('当前文章xml: {}'.format(name_xml))
                     self.create_xml(ftp_info.ftp_dict(), name_xml)
                     ftp_list.append(name_xml)
                 log.info("发包")
                 # TODO: packet sending can time out; consider adjusting the MTU.
                 if ftp_info is not None:
                     entity.uploads_ftp(ftp_info, ftp_list)
                 if entity:
                     entity.uploads_datacenter_relay(backpack_list)
                     entity.uploads_datacenter_unity(backpack_list)
                 log.info("发包完成")
         except Exception as e:
             # One failing account aborts the rest of this pass; the outer
             # while loop then retries with a fresh account list.
             log.exception("解析公众号错误 {}".format(e))
             if 'chrome not reachable' in str(e):
                 raise RuntimeError('chrome not reachable')
             continue