示例#1
0
 def run(self):
     """Process a fixed list of WeChat article URLs and upload the results.

     For every URL the page is downloaded, the account name and account
     text are scraped from it with ``pq``, and an ``Account`` /
     ``Article`` / ``JsonEntity`` / ``Backpack`` chain is built.  Each
     entity is passed to ``self.save_to_mongo`` and an XML file is
     produced through ``self.create_xml``.  After the loop the collected
     XML names and backpacks are uploaded through the last entity.

     Raises:
         RuntimeError: ``'chrome not reachable'`` when a caught error
             message contains that phrase.  Every other exception is
             logged and swallowed.
     """
     # Without a timeout ``requests.get`` can block forever on a stalled
     # connection; a timeout error is handled by the ``except`` below.
     request_timeout = 30  # seconds
     try:
         urls_article = [
             'https://mp.weixin.qq.com/s?src=11&timestamp=1541559601&ver=1229&signature=ixTsG-RvK8H58t6D-CpW6olWI8hA52Wz-FRb12ZcrNG-lxR20YutoyLYUr-RB3w8WHjE1petjDcbbxZVxTChvPWM27qszWu0Z3zonjx8SEQB5mmgm1O9Eu*5qsFhnBCH&new=1'
         ]
         entity = None
         backpack_list = []
         ftp_list = []
         ftp_info = None
         for page_count, url in enumerate(urls_article):
             html = requests.get(url, timeout=request_timeout)
             # Determine the account information.  The page is parsed once
             # and the document is reused for both selectors.
             document = pq(html.text)
             name = document('#js_name').text()
             account_name = document('.profile_meta_value').eq(0).text()
             log('---{}---{}---'.format(name, account_name))
             account = Account()
             account.name = name
             account.account = account_name
             account.get_account_id()
             article = Article()
             try:
                 article.create(url, account)
             except RuntimeError as run_error:
                 # Best effort: the failure is only logged and processing
                 # continues with whatever state the article object has.
                 log('找不到浏览器 {}'.format(run_error))
             log('第{}条 文章标题: {}'.format(page_count, article.title))
             log("当前文章url: {}".format(url))
             entity = JsonEntity(article, account)
             log('当前文章ID: {}'.format(entity.id))
             backpack = Backpack()
             backpack.create(entity)
             backpack_list.append(backpack.create_backpack())
             self.save_to_mongo(entity.to_dict())
             # FTP package
             ftp_info = Ftp(entity)
             name_xml = ftp_info.hash_md5(ftp_info.url)
             log('当前文章xml: {}'.format(name_xml))
             self.create_xml(ftp_info.ftp_dict(), name_xml)
             ftp_list.append(name_xml)
         log("发包")
         # TODO: sending packages times out; adjust the MTU
         if ftp_info is not None:
             # ftp_info is only assigned after entity, so entity is set here.
             entity.uploads_ftp(ftp_info, ftp_list)
         if entity:
             entity.uploads_datacenter_relay(backpack_list)
             entity.uploads_datacenter_unity(backpack_list)
         log("发包完成")
     except Exception as e:
         log("解析公众号错误 {}".format(e))
         # Only this specific failure is escalated to the caller.
         if 'chrome not reachable' in str(e):
             raise RuntimeError('chrome not reachable')
示例#2
0
    def run(self):
        """Crawl a hard-coded account list, record articles and upload them.

        For each account name the homepage is fetched through
        ``self.account_homepage``; accounts without a homepage result are
        skipped.  At most five articles per account (indexes 0-4) are turned
        into ``JsonEntity`` / ``Backpack`` objects and one row per article
        is written through ``uploads_mysql``.  After all accounts are
        processed the backpacks are uploaded through the last entity.
        """
        account_list = ['有看投',]
        entity = None
        backpack_list = []
        for name in account_list:
            self.name = name
            html_account = self.account_homepage()
            if html_account:
                html, account_of_homepage = html_account
            else:
                continue
            log('start 公众号: ', self.name)
            urls_article = self.urls_article(html)

            account = Account()
            account.name = self.name
            account.account = account_of_homepage
            account.get_account_id()

            for page_count, url in enumerate(urls_article):
                article = Article()
                article.create(url, account)
                log('文章标题:', article.title)
                log("第{}条".format(page_count))

                entity = JsonEntity(article, account)
                backpack = Backpack()
                backpack.create(entity)
                backpack_list.append(backpack.create_backpack())
                # The original built a new, unused pymongo.MongoClient here
                # for every article and never closed it; that leaked client
                # has been removed.
                # Upload to the database
                sql = '''
                        INSERT INTO
                            account_http(article_url, addon, account, account_id, author, id, title)
                        VALUES
                            (%s, %s, %s, %s, %s, %s, %s)
                '''
                _tuple = (
                    article.url, datetime.datetime.now(), entity.account, entity.account_id, entity.author, entity.id,
                    entity.title
                )
                uploads_mysql(config_mysql, sql, _tuple)
                # Stop after the fifth article of this account.
                if page_count == 4:
                    break

        log("发包")
        if entity:
            entity.uploads_datacenter_unity(backpack_list)
            print('end')
示例#3
0
 def run(self):
     """Collect and upload articles for scheduled accounts in a loop.

     Each round takes the account list from ``ADD_COLLECTION`` when it is
     truthy, otherwise from ``self.get_account()``.  A ``None`` list makes
     the loop sleep five seconds and retry.  For every account the article
     URLs are resolved, each article is turned into a ``JsonEntity`` and,
     unless it is reported as a duplicate, into a backpack and an XML
     file; the results are uploaded once the account is finished.  The
     loop ends after one round when ``ADD_COLLECTION`` is truthy.

     Raises:
         RuntimeError: ``'chrome not reachable'`` when a caught error
             message contains that phrase.  Other errors are logged and
             the next account is processed.
     """
     round_no = 0
     while True:
         # ADD_COLLECTION: accounts for supplementary collection;
         # get_account: daily collection.  account_list covers both a
         # single account and a list of accounts.
         account_list = ADD_COLLECTION or self.get_account()
         round_no += 1
         log.info('第{}次'.format(round_no))
         if account_list is None:
             log.info('调度队列为空,休眠5秒')
             time.sleep(5)
             continue
         for account_name in account_list:
             try:
                 self.search_name = account_name
                 html = self.account_homepage()
                 if not html:
                     log.info('{}|找到不到微信号'.format(account_name))
                     continue
                 urls_article = self.urls_article(html)
                 # Determine the account information
                 account = Account()
                 account.name = self.name
                 account.account = account_name
                 account.tags = self.get_tags()
                 account.get_account_id()
                 # Duplicate check
                 if JUDEG:
                     ids = self.dedup(account_name)
                 else:
                     ids = ''
                 entity = None
                 ftp_info = None
                 backpack_list = []
                 ftp_list = []
                 for index, url in enumerate(urls_article):
                     article = Article()
                     try:
                         article.create(url, account)
                     except RuntimeError as err:
                         # Logged only; the article is still processed.
                         log.info('微信验证码错误 {}'.format(err))
                     log.info('第{}条 文章标题: {}'.format(
                         index, article.title))
                     log.info("当前文章url: {}".format(url))
                     entity = JsonEntity(article, account)
                     log.info('当前文章ID: {}'.format(entity.id))
                     if entity.id in ids and JUDEG is True:
                         log.info('当前文章已存在,跳过')
                         continue
                     pack = Backpack()
                     pack.create(entity)
                     backpack_list.append(pack.create_backpack())
                     # FTP package
                     ftp_info = Ftp(entity)
                     xml_name = ftp_info.hash_md5(ftp_info.url)
                     log.info('当前文章xml: {}'.format(xml_name))
                     self.create_xml(ftp_info.ftp_dict(), xml_name)
                     ftp_list.append(xml_name)
                 log.info("开始发包")
                 # TODO: sending packages times out; adjust the MTU
                 if ftp_info is not None:
                     entity.uploads_ftp(ftp_info, ftp_list)
                     log.info("ftp发包完成")
                 if entity and backpack_list:
                     entity.uploads_datacenter_relay(backpack_list)
                     entity.uploads_datacenter_unity(backpack_list)
                     log.info("数据中心,三合一,发包完成")
             except Exception as exc:
                 log.exception("解析公众号错误 {}".format(exc))
                 if 'chrome not reachable' in str(exc):
                     raise RuntimeError('chrome not reachable')
         if ADD_COLLECTION:
             break
示例#4
0
    def run(self):
        """Collect, deduplicate and upload articles for scheduled accounts.

        Each round takes the account list from ``ADD_COLLECTION`` when it
        is truthy, otherwise from ``self.get_account()``; a ``None`` list
        makes the loop sleep five seconds and retry.  For every account a
        dedup key is derived from the account name and id and looked up
        with ``self.dedup_redis``.  Articles more than 15 days old end the
        article loop; articles whose id is already in the dedup result are
        skipped.  The remaining ones are packed, uploaded, and their ids
        are posted to the dedup service.  The loop ends after one round
        when ``ADD_COLLECTION`` is truthy.

        Raises:
            RuntimeError: ``'chrome not reachable'`` when a caught error
                message contains that phrase or ``'Message: timeout'``
                (raised after a 30 second sleep).
        """
        round_no = 0
        while True:
            # ADD_COLLECTION: accounts for supplementary collection;
            # get_account: daily collection.  account_list covers both a
            # single account and a list of accounts.
            account_list = ADD_COLLECTION or self.get_account()
            log.info('当前运行的线程数为:{}'.format(threading.active_count()))
            log.info('当前运行的进程:{}'.format(
                multiprocessing.current_process().name))
            round_no += 1
            log.info('第{}次'.format(round_no))
            if account_list is None:
                log.info('调度队列为空,休眠5秒')
                time.sleep(5)
                continue
            for account_name in account_list:
                try:
                    self.search_name = account_name
                    html = self.account_homepage()
                    if not html:
                        log.info('{}|找到不到微信号'.format(account_name))
                        continue
                    urls_article = self.urls_article(html)
                    # Determine the account information
                    account = Account()
                    account.name = self.name
                    account.account = account_name
                    account.tags = self.get_tags()
                    account.get_account_id()
                    # Duplicate check through redis
                    sentenced_keys = '{} {}'.format(
                        account.account, str(account.account_id))
                    keys = hash_md5(sentenced_keys)
                    log.info('keys: {}'.format(keys))
                    dedup_result = self.dedup_redis(keys)
                    post_dedup_urls = []

                    entity = None
                    ftp_info = None
                    backpack_list = []
                    ftp_list = []
                    for index, url in enumerate(urls_article):
                        try:
                            article = Article()
                            article.create(url, account, self.proxies)
                            log.info('第{}条 文章标题: {}'.format(
                                index, article.title))
                            log.info("当前文章url: {}".format(url))
                            entity = JsonEntity(article, account)
                            log.info('当前文章ID: {}'.format(entity.id))
                            # The last three characters of the timestamp
                            # are dropped before conversion (presumably
                            # milliseconds - TODO confirm).
                            seconds = int(str(article.time)[:-3])
                            published = datetime.datetime.fromtimestamp(
                                seconds)
                            age = datetime.date.today() - published.date()
                            if age.days > 15:
                                log.info(
                                    '超过采集interval最大15天 的文章不采集,已采集{}条文章'.format(
                                        index))
                                self.count_articles(index)
                                break
                            # Known ids are skipped; everything else is
                            # remembered for the dedup upload below.
                            if dedup_result and entity.id in dedup_result:
                                log.info('当前文章已存在,跳过')
                                continue
                            post_dedup_urls.append(entity.id)

                            pack = Backpack()
                            pack.create(entity)
                            backpack_list.append(pack.create_backpack())
                            # FTP package
                            ftp_info = Ftp(entity)
                            xml_name = ftp_info.hash_md5(ftp_info.url)
                            log.info('当前文章xml: {}'.format(xml_name))
                            self.create_xml(ftp_info.ftp_dict(), xml_name)
                            ftp_list.append(xml_name)
                        except Exception as err:
                            log.info('微信解析文章错误 {}'.format(err))
                            continue

                    log.info("开始发包")
                    if not (entity and backpack_list):
                        log.info('包列表为空,不发送数据')
                        continue
                    entity.uploads_datacenter_relay(backpack_list)
                    entity.uploads_datacenter_unity(backpack_list)
                    log.info("数据中心,三合一,发包完成")
                    # TODO: sending packages times out; adjust the MTU
                    if ftp_info is not None:
                        entity.uploads_ftp(ftp_info, ftp_list)
                        log.info("ftp发包完成")
                    if post_dedup_urls:
                        log.info('上传判重中心key:{} urls:{}'.format(
                            keys, post_dedup_urls))
                        dedup_api = 'http://47.100.53.87:8008/Schedule/CacheWx'
                        payload = [{
                            "key": keys,
                            "sourceNodes": "1",
                            "sourceType": "2",
                            "urls": post_dedup_urls
                        }]
                        response = requests.post(dedup_api,
                                                 data=json.dumps(payload),
                                                 timeout=self.timeout)
                        log.info('上传判重中心结果{}'.format(
                            response.status_code))
                except Exception as exc:
                    log.exception("解析公众号错误 {}".format(exc))
                    time.sleep(30)
                    message = str(exc)
                    fatal_markers = ('chrome not reachable',
                                     'Message: timeout')
                    if any(marker in message for marker in fatal_markers):
                        raise RuntimeError('chrome not reachable')
            if ADD_COLLECTION:
                break
示例#5
0
 def run(self):
     """Collect and upload articles for the configured accounts forever.

     Each round takes the account list from ``ADD_COLLECTION`` when it is
     truthy, otherwise from ``self.account_list()``.  For every account
     the article URLs are resolved and each article is turned into a
     ``JsonEntity``.  Ids already returned by ``self.dedup`` are skipped;
     the rest are packed, passed to ``self.save_to_mongo`` and written as
     XML, then uploaded once the account is finished.

     The ``try`` wraps the whole account list, so one failing account
     ends the current round and the loop starts the next one.

     Raises:
         RuntimeError: ``'chrome not reachable'`` when a caught error
             message contains that phrase.
     """
     count = 0
     while True:
         count += 1
         log.info('第{}次'.format(count))
         account_list = ADD_COLLECTION if ADD_COLLECTION else self.account_list(
         )
         try:
             for account_name in account_list:
                 log.info('第{}次'.format(count))
                 self.search_name = account_name
                 html_account = self.account_homepage()
                 if html_account:
                     html = html_account
                 else:
                     # Fix: the message had no '{}' placeholder, so the
                     # account name was silently dropped from the log.
                     log.info('找到不到微信号首页: {}'.format(account_name))
                     continue
                 urls_article = self.urls_article(html)
                 # Determine the account information
                 account = Account()
                 account.name = self.name
                 account.account = account_name
                 account.get_account_id()
                 # Duplicate check
                 ids = self.dedup(account_name)
                 entity = None
                 backpack_list = []
                 ftp_list = []
                 ftp_info = None
                 for page_count, url in enumerate(urls_article):
                     article = Article()
                     try:
                         article.create(url, account)
                     except RuntimeError as run_error:
                         # Logged only; the article is still processed.
                         log.info('找不到浏览器 {}'.format(run_error))
                     log.info('第{}条 文章标题: {}'.format(
                         page_count, article.title))
                     log.info("当前文章url: {}".format(url))
                     entity = JsonEntity(article, account)
                     log.info('当前文章ID: {}'.format(entity.id))
                     if entity.id in ids:
                         log.info('当前文章已存在,跳过')
                         continue
                     backpack = Backpack()
                     backpack.create(entity)
                     backpack_list.append(backpack.create_backpack())
                     self.save_to_mongo(entity.to_dict())
                     # FTP package
                     ftp_info = Ftp(entity)
                     name_xml = ftp_info.hash_md5(ftp_info.url)
                     log.info('当前文章xml: {}'.format(name_xml))
                     self.create_xml(ftp_info.ftp_dict(), name_xml)
                     ftp_list.append(name_xml)
                 log.info("发包")
                 # TODO: sending packages times out; adjust the MTU
                 if ftp_info is not None:
                     entity.uploads_ftp(ftp_info, ftp_list)
                 if entity:
                     entity.uploads_datacenter_relay(backpack_list)
                     entity.uploads_datacenter_unity(backpack_list)
                 log.info("发包完成")
         except Exception as e:
             log.exception("解析公众号错误 {}".format(e))
             if 'chrome not reachable' in str(e):
                 raise RuntimeError('chrome not reachable')
示例#6
0
 def run(self):
     """Collect and upload articles for scheduled accounts forever.

     Each round takes the account list from ``ADD_COLLECTION`` when it is
     truthy, otherwise from ``self.get_account()``.  A ``None`` list makes
     the loop sleep five seconds and start the next round.  For every
     account each article is turned into a ``JsonEntity`` and a backpack,
     and the backpacks are uploaded with ``entity.uploads`` once the
     account is finished.  Duplicates are only logged here, not skipped.

     Raises:
         RuntimeError: ``'chrome not reachable'`` when a caught error
             message contains that phrase.  Other errors are logged and
             the next account is processed.
     """
     count = 0
     while True:
         count += 1
         log.info('第{}次'.format(count))
         # ADD_COLLECTION: accounts for supplementary collection;
         # get_account: daily collection.  account_list covers both a
         # single account and a list of accounts.
         account_list = ADD_COLLECTION if ADD_COLLECTION else self.get_account()
         if account_list is None:
             log.info('调度队列为空,休眠5秒')
             time.sleep(5)
             # Fix: without this the loop below iterated over None and the
             # resulting TypeError (outside the try) ended the crawler.
             continue
         for account_name in account_list:
             try:
                 self.search_name = account_name
                 html_account = self.account_homepage()
                 if html_account:
                     html = html_account
                 else:
                     log.info('{}|找到不到微信号'.format(account_name))
                     continue
                 urls_article = self.urls_article(html)
                 # Determine the account information
                 account = Account()
                 account.name = self.name
                 account.account = account_name
                 account.tags = self.get_tags()
                 account.get_account_id()
                 if not account.account_id:
                     # Fix: the message had no '{}' placeholder, so the
                     # name was silently dropped from the log.
                     log.info('{}|没有account_id'.format(self.name))
                     # NOTE(review): this abandons the remaining accounts
                     # of the round - confirm 'continue' was not intended.
                     break
                 # Duplicate check
                 ids = self.dedup(account_name) if JUDEG else ''
                 entity = None
                 backpack_list = []
                 for page_count, url in enumerate(urls_article):
                     article = Article()
                     try:
                         article.create(url, account)
                     except RuntimeError as run_error:
                         # Logged only; the article is still processed.
                         log.info('找不到浏览器 {}'.format(run_error))
                     log.info('第{}条 文章标题: {}'.format(page_count, article.title))
                     log.info("当前文章url: {}".format(url))
                     entity = JsonEntity(article, account)
                     log.info('当前文章ID: {}'.format(entity.id))
                     if entity.id in ids and JUDEG is True:
                         # Duplicates are reported but still uploaded (the
                         # original 'continue' is disabled).
                         log.info('当前文章已存在,跳过0')
                     backpack = Backpack()
                     backpack.create(entity)
                     backpack_list.append(backpack.create_backpack())
                 log.info("开始发包")
                 if entity and backpack_list:
                     entity.uploads(backpack_list)
                     log.info("发包完成")
             except Exception as e:
                 log.exception("解析公众号错误 {}".format(e))
                 if 'chrome not reachable' in str(e):
                     raise RuntimeError('chrome not reachable')
示例#7
0
    def run(self):
        """Crawl one account's recent articles and store an analysis result.

        The homepage is fetched through ``self.account_homepage``; without
        a result ``self.send_result`` is called and the method returns.
        Articles are processed until the first one that is 7 or more days
        old.  Each article is classified with ``self.emotion_judge`` and
        packed.  The concatenated article content is sent to the NER
        service, the 20 most frequent entities are kept as key words, and
        the combined result is written to the ``newMedia`` collection
        before ``self.send_result`` is called with ``self.status = 3``.
        """
        html_account = self.account_homepage()
        if not html_account:
            self.send_result()
            return
        html, account_of_homepage = html_account
        log('start 公众号: ', self.name)
        urls_article = self.urls_article(html)

        account = Account()
        account.name = self.name
        account.account = account_of_homepage
        account.get_account_id()

        articles = []
        backpack_list = []
        positive_article = 0
        negative_article = 0
        for page_count, url in enumerate(urls_article):
            article = Article()
            log('url:', url)
            article.create(url, self.name)
            log('第{}条, 文章标题: {}'.format(page_count, article.title))

            # Stop at the first article that is 7 or more days old.
            if article.time:
                # The last three characters of the timestamp are dropped
                # (presumably milliseconds - TODO confirm).
                article_date = datetime.datetime.fromtimestamp(
                    int(article.time[:-3]))
                day_diff = datetime.datetime.now().date() - article_date.date()
                if day_diff.days > 6:
                    break
            # Count positive and negative articles; ties count as negative.
            count_positive, count_negative = self.emotion_judge(
                article.content)
            if count_positive > count_negative:
                positive_article += 1
            else:
                negative_article += 1
            entity = JsonEntity(article, account)
            backpack = Backpack()
            backpack.create(entity)
            backpack_list.append(backpack.create_backpack())
            # All articles
            articles.append(backpack.to_dict())
        log('所有文章抓取完毕')
        # Join once instead of repeated '+='; a missing 'Content' is
        # treated as empty text rather than raising TypeError.
        content_all_list = ''.join(
            article_info.get('Content') or '' for article_info in articles)
        log('文章长度', len(content_all_list))
        # Named-entity extraction
        key_words_list = []
        GETNER_API_URL = 'http://221.204.232.7:40015/NER/GetNer'
        data = {
            "texts": [content_all_list],
        }
        log('请求分词')
        response = requests.post(url=GETNER_API_URL, data=data, timeout=180)
        ner_result = response.json().get('rst')[0]
        if ner_result.get('status') == 'success':
            ner_entities = ner_result.get('ner')
            # Same category order as before: ORG, LOC, PER.
            for category in ('ORG', 'LOC', 'PER'):
                occurrences = ner_entities.get(category)
                if occurrences:
                    key_words_list.extend(occurrences.items())

        # Keep the 20 most frequent words.  Fix: the slice used to be
        # [:21], which returned 21 entries.
        key_words = dict()
        key_words['list'] = []
        key_words_list = sorted(key_words_list,
                                key=lambda x: x[1],
                                reverse=True)[:20]
        for k in key_words_list:
            key_words['list'].append({"times": k[1], "keyword": k[0]})
        log('分词完成')
        # Process the articles
        result = handle(articles)
        result['KeyWord'] = key_words
        result['ArtPosNeg'] = {
            'Indicate': {
                'Positive': positive_article,
                'Negative': negative_article
            }
        }
        result['Success'] = True
        result['Account'] = self.name
        result['Message'] = ''
        # NOTE(review): if db is a PyMongo database, Collection.update no
        # longer exists in PyMongo 4 - confirm the driver version in use.
        db['newMedia'].update({'Account': self.name},
                              {'$set': {
                                  'data': result
                              }})
        log('{} 抓取完成'.format(self.name))
        # Send the success notification to the front end
        self.status = 3
        self.send_result()
示例#8
0
    def run(self):
        """Crawl one account's recent articles and store an analysis result.

        The homepage is fetched through ``self.account_homepage``; without
        a result the method returns.  Articles are processed until the
        first one that is 7 or more days old.  Each article is classified
        with ``self.emotion_judge`` and packed.  The concatenated article
        content is segmented with ``thulac``, the 20 most common words of
        at least two characters that contain a CJK character and carry
        one of the listed noun tags are kept as key words, and the
        combined result is written to the ``newMedia`` collection.
        ``self.status`` is set to 3 at the end.
        """
        html_account = self.account_homepage()
        if not html_account:
            return
        html, account_of_homepage = html_account
        log('start 公众号: ', self.name)
        urls_article = self.urls_article(html)

        account = Account()
        account.name = self.name
        account.account = account_of_homepage
        account.get_account_id()

        articles = []
        backpack_list = []
        positive_total = 0
        negative_total = 0
        for index, url in enumerate(urls_article):
            article = Article()
            log('url:', url)
            article.create(url, self.name)
            log('文章标题:', article.title)
            log("第{}条".format(index))

            # Stop at the first article that is 7 or more days old.
            if article.time:
                published = datetime.datetime.fromtimestamp(
                    int(article.time[:-3]))
                age = datetime.datetime.now().date() - published.date()
                if age.days > 6:
                    break
            # Count positive and negative articles; ties count as negative.
            positive_hits, negative_hits = self.emotion_judge(
                article.content)
            if positive_hits > negative_hits:
                positive_total += 1
            else:
                negative_total += 1
            entity = JsonEntity(article, account)
            pack = Backpack()
            pack.create(entity)
            backpack_list.append(pack.create_backpack())
            # All articles
            articles.append(pack.to_dict())
        log('所有文章抓取完毕')
        content_all_list = ''.join(
            item.get('Content') for item in articles)
        # Word segmentation
        noun_tags = ['n', 'np', 'ns', 'ni', 'nz']
        thu1 = thulac.thulac()
        seg_list = thu1.cut(content_all_list, text=False)
        key_words_list = [
            s[0] for s in seg_list
            if (len(s[0]) >= 2 and re.search('[\u4e00-\u9fff]+', s[0])
                and s[1] in noun_tags)
        ]

        # Keep the 20 most frequent words
        key_word = {
            'list': [{"times": times, "keyword": word}
                     for word, times in Counter(key_words_list).most_common(20)]
        }
        # Process the articles
        result = handle(articles)
        result['KeyWord'] = key_word
        result['ArtPosNeg'] = {
            'Indicate': {
                'Positive': positive_total,
                'Negative': negative_total
            }
        }
        result['Success'] = True
        result['Account'] = self.name
        result['Message'] = ''
        db['newMedia'].update({'Account': self.name},
                              {'$set': {
                                  'data': result
                              }})
        log('{} 抓取完成'.format(self.name))
        # Send the success notification to the front end
        self.status = 3