示例#1
0
    def run(self):
        """Crawl every account returned by ``self.get_name()`` and upload its articles.

        For each account name the homepage is requested once through
        ``self.account_homepage(name)``; a falsy result means the account could
        not be found and it is skipped. Each article link extracted from the
        homepage HTML is loaded into an ``Article``; articles whose
        ``is_share`` is ``True`` are skipped. The remaining ones are wrapped in
        a ``JsonEntity``/``Backpack``, recorded in the ``account_http`` table
        through ``uploads_mysql`` and, once the account is finished, sent with
        ``entity.uploads``.
        """
        sql = '''   
                    INSERT INTO 
                        account_http(article_url, addon, account, account_id, author, id, title) 
                    VALUES 
                        (%s, %s, %s, %s, %s, %s, %s)
                '''
        for name in self.get_name():
            log('start {}'.format(name))
            self.name = name
            homepage = self.account_homepage(name)
            # Skip accounts that the search could not find.
            if not homepage:
                log('not find {}'.format(self.name))
                continue
            # Reuse the response fetched above instead of requesting the
            # homepage a second time.
            html, _account = homepage
            # Links of all articles listed on the homepage.
            items = re.findall('"content_url":".*?,"copyright_stat"', html)
            backpack_list = []
            # Reset per account so an account without usable articles neither
            # raises NameError nor reuses the previous account's entity.
            entity = None
            for item in items:
                # Drop the leading '"content_url":"' (15 chars) and the
                # trailing '","copyright_stat"' (18 chars), then un-escape '&amp;'.
                url_last = item[15:-18].replace('amp;', '')
                url = 'https://mp.weixin.qq.com' + url_last
                article = Article()
                article.create(url)
                # Shared articles are not collected.
                if article.is_share is True:
                    continue
                log("catch {}".format(article.title))
                account = Account()
                # The account is built from the article itself here, which
                # differs from the source-search flow.
                account.name = article.author
                account.account = article.account
                account.get_account_id()
                entity = JsonEntity(article, account)
                backpack = Backpack()
                backpack.create(entity)
                backpack_list.append(backpack.create_backpack())

                # Record the article in the database.
                _tuple = (
                    article.url, datetime.datetime.now(), entity.account, entity.account_id, entity.author, entity.id,
                    entity.title
                )
                uploads_mysql(config_mysql, sql, _tuple)
            # Report the number of article links found (the original logged the
            # last loop index, which was one short and undefined when empty).
            log('catch {} successul 共{}条文章'.format(self.name, len(items)))

            log("发包")
            if entity:
                entity.uploads(backpack_list)
                log("uploads successful")
示例#2
0
    def run(self):
        """Crawl a fixed list of public accounts and upload their articles.

        For every account name the homepage is fetched with
        ``self.account_homepage()`` (accounts with a falsy result are skipped),
        the article URLs are taken from ``self.urls_article(html)`` and each
        article is packed, written to the ``account_http`` table and finally
        sent with ``entity.uploads``.

        The upload happens once per account: ``backpack_list`` is rebuilt for
        every account, so uploading only after the outer loop (as the previous
        version did) sent nothing but the last account's articles and raised
        NameError when no article had been processed at all.
        """
        account_list = [
            '晚聊伴夜', '氢氪财经', '菲迪克智慧工程企业管理平台', '山西同乡群', '筱猫影视', '沈阳南动车运用所',
            '潇湘茶', '众智睿赢企业管理咨询有限公司', '微景相册', '书悦堂', '分享好宝贝', '民艺旅舍', '女王Dcup',
            '轻松定位美丽', '乐清市红辣椒越剧艺苑', '畅舞馆', '人禾健康产业', '常州格物斯坦机器人创客中心', '千秋妃子',
            '崇左航博'
        ]
        sql = '''   
                        INSERT INTO 
                            account_http(article_url, addon, account, account_id, author, id, title) 
                        VALUES 
                            (%s, %s, %s, %s, %s, %s, %s)
                '''

        for name in account_list:
            self.name = name
            html_account = self.account_homepage()
            if not html_account:
                continue
            html, account_of_homepage = html_account
            log('start 公众号: ', self.name)
            urls_article = self.urls_article(html)

            # NOTE(review): `Acount` is kept exactly as written; confirm it is
            # the real class name in this project and not a typo of `Account`.
            account = Acount()
            account.name = self.name
            account.account = account_of_homepage
            account.get_account_id()

            entity = None
            backpack_list = []
            for page_count, url in enumerate(urls_article):
                article = Article()
                article.create(url, self.name)
                log('文章标题:', article.title)
                log("第{}条".format(page_count))

                entity = JsonEntity(article, account)
                backpack = Backpack()
                backpack.create(entity)
                backpack_list.append(backpack.create_backpack())

                # Record the article in the database. (The unused MongoDB
                # client that used to be opened here for every article was
                # never read or closed and has been removed.)
                _tuple = (article.url, datetime.datetime.now(), entity.account,
                          entity.account_id, entity.author, entity.id,
                          entity.title)
                uploads_mysql(config_mysql, sql, _tuple)

            log("发包")
            if entity:
                entity.uploads(backpack_list)
示例#3
0
    def run(self):
        """Crawl the configured accounts and send everything in one upload.

        For every account name the homepage is fetched with
        ``self.account_homepage()`` (accounts with a falsy result are skipped)
        and at most the first five article URLs from
        ``self.urls_article(html)`` are processed. Each article is packed and
        written to the ``account_http`` table; the packed articles of all
        accounts are accumulated and sent once at the end through
        ``entity.uploads_datacenter_unity``.
        """
        account_list = ['有看投',]
        sql = '''
                        INSERT INTO
                            account_http(article_url, addon, account, account_id, author, id, title)
                        VALUES
                            (%s, %s, %s, %s, %s, %s, %s)
                '''
        # Both are shared across accounts because the upload happens only once,
        # after the outer loop.
        entity = None
        backpack_list = []
        for name in account_list:
            self.name = name
            html_account = self.account_homepage()
            if not html_account:
                continue
            html, account_of_homepage = html_account
            log('start 公众号: ', self.name)
            urls_article = self.urls_article(html)

            account = Account()
            account.name = self.name
            account.account = account_of_homepage
            account.get_account_id()

            for page_count, url in enumerate(urls_article):
                article = Article()
                article.create(url, account)
                log('文章标题:', article.title)
                log("第{}条".format(page_count))

                entity = JsonEntity(article, account)
                backpack = Backpack()
                backpack.create(entity)
                backpack_list.append(backpack.create_backpack())

                # Record the article in the database. (The unused MongoDB
                # client that used to be opened here for every article was
                # never read or closed and has been removed.)
                _tuple = (
                    article.url, datetime.datetime.now(), entity.account, entity.account_id, entity.author, entity.id,
                    entity.title
                )
                uploads_mysql(config_mysql, sql, _tuple)
                # Stop after the fifth article (indices 0-4) of this account.
                if page_count == 4:
                    break

        log("发包")
        if entity:
            entity.uploads_datacenter_unity(backpack_list)
            print('end')
示例#4
0
    def run(self):
        """Crawl the selected accounts and upload their articles per account.

        The account names come from ``global_account_list`` when it is truthy,
        otherwise a single built-in name is used. For each account the first
        article URL (index 0) is skipped and processing stops after index 5.
        Every processed article is packed, written to the ``account_http``
        table and, after the account is finished, sent with
        ``entity.uploads``.
        """
        insert_sql = '''
                        INSERT INTO
                            account_http(article_url, addon, account, account_id, author, id, title)
                        VALUES
                            (%s, %s, %s, %s, %s, %s, %s)
                '''
        names = global_account_list if global_account_list else ['刀口谈兵']

        for current_name in names:
            self.name = current_name
            homepage = self.account_homepage()
            # A falsy homepage result means there is nothing to crawl.
            if not homepage:
                continue
            html, account_of_homepage = homepage
            log('start 公众号: ', self.name)
            article_urls = self.urls_article(html)

            account = Account()
            account.name = self.name
            account.account = account_of_homepage
            # The account id is taken from the global override, or a fixed
            # fallback, instead of being looked up.
            account.account_id = global_account_id if global_account_id else 126776905

            entity = None
            packed = []
            for index, article_url in enumerate(article_urls):
                # The first entry is never processed.
                if index == 0:
                    continue
                article = Article()
                article.create(article_url, self.name)
                log('文章标题:', article.title)
                log("第{}条".format(index))

                entity = JsonEntity(article, account)
                backpack = Backpack()
                backpack.create(entity)
                payload = backpack.create_backpack()
                packed.append(payload)

                # Record the article in the database.
                row = (
                    article.url,
                    datetime.datetime.now(),
                    entity.account,
                    entity.account_id,
                    entity.author,
                    entity.id,
                    entity.title,
                )
                uploads_mysql(config_mysql, insert_sql, row)
                # Index 5 is the last one handled for an account.
                if index == 5:
                    break

            log("发包")
            if entity:
                entity.uploads(packed)
                print('end')
示例#5
0
    def run(self):
        """Fetch one account's history page and upload the articles it lists.

        The profile page for a fixed ``__biz`` value is requested, the
        ``var msgList`` JavaScript assignment is extracted and HTML-unescaped,
        and every article link in it is loaded into an ``Article``. Each
        article is packed, written to the ``account_http`` table and finally
        sent with ``entity.uploads``. Articles for which ``backpack.create``
        raises are logged and skipped.

        Returns early (after logging) when the response contains no
        ``msgList`` block instead of failing with AttributeError.
        """
        biz = 'MjM5MDYxNzcwNA'
        # NOTE(review): the uin/key query parameters are hard-coded in this
        # literal; confirm they are still valid before relying on this method.
        self.url = 'https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz={}==&uin=MTA2NjAyMjkyMA==&key=2d9f0071c5e3853997fb949a262f6e94880e0112444bb490548a0fa39b12232e3736b5d92b5f923541e1aa4a3dbd3c63d4bc9cbfc09dcb9911e82e9f497263a10535ea271c5479e87a2a7623e8179aa0'.format(biz)
        resp = requests.get(self.url, headers=self.headers)
        match = re.search('var msgList =.*?\';', resp.text)
        if match is None:
            log('msgList not found in response', self.url)
            return
        escape_url = html.unescape(match.group())

        urls = re.findall('content_url.*?mp.weixin.qq.com.*?#wechat_redirect', escape_url)
        prefix = 'https://mp.weixin.qq.com/s?'
        sql = '''
            INSERT INTO
                account_http(article_url, addon, account, account_id, author, id, title)
            VALUES
                (%s, %s, %s, %s, %s, %s, %s)
                    '''
        backpack_list = []
        # Stays None when no URL was found, so the upload below is skipped
        # instead of raising NameError.
        entity = None
        for url in urls:
            # Un-escape '&amp;' and replace the matched JSON prefix (including
            # its escaped slashes) with the canonical article prefix.
            url = prefix + url.replace('amp;', '').replace(
                r'content_url":"http:\\/\\/mp.weixin.qq.com\\/s?', '')
            log('文章链接', url)
            article = Article()
            article.create(url)
            log("文章标题 {}".format(article.title))
            # NOTE(review): `Acount` is kept exactly as written; confirm it is
            # the real class name in this project and not a typo of `Account`.
            account = Acount()
            # The account is built from the article itself here, which differs
            # from the source-search flow.
            account.name = article.author
            account.account = article.account
            account.get_account_id()
            entity = JsonEntity(article, account)
            backpack = Backpack()

            # For shared articles the publish time cannot be matched and
            # create() raises; such articles are skipped.
            try:
                backpack.create(entity)
            except Exception as e:
                log('share error', e)
                continue
            backpack_list.append(backpack.create_backpack())

            # Record the article in the database.
            _tuple = (
                entity.url, datetime.datetime.now(), entity.account, entity.account_id, entity.author,
                entity.id,
                entity.title
            )
            uploads_mysql(config_mysql, sql, _tuple)
        # len(urls) equals the former `article_count + 1` for a non-empty list
        # and correctly reports 0 (not 1) for an empty one.
        log('采集账号:{} 所有文章完毕,共{}条文章'.format(self.name, len(urls)))

        log("发包")
        if entity:
            entity.uploads(backpack_list)
            log("uploads successful")
示例#6
0
 def save_to_mysql(entity):
     """Insert one row describing ``entity`` into the ``account_http`` table.

     The row holds the entity's url, the current local time, account,
     account_id, author, id and title. The MySQL configuration is obtained
     from ``get_mysql_new()``. Any exception raised while obtaining the
     configuration or uploading is logged with ``log.info`` and not re-raised.
     """
     row = (
         entity.url,
         datetime.datetime.now(),
         entity.account,
         entity.account_id,
         entity.author,
         entity.id,
         entity.title,
     )
     statement = '''   
             INSERT INTO 
                 account_http(article_url, addon, account, account_id, author, id, title) 
             VALUES 
                 (%s, %s, %s, %s, %s, %s, %s)
     '''
     try:
         uploads_mysql(get_mysql_new(), statement, row)
     except Exception as error:
         # Best effort: a failed insert is reported but does not stop the caller.
         log.info('数据库上传错误 {}'.format(error))
示例#7
0
    def run(self):
        """Crawl the history page of every biz id from ``self.biz_list()``.

        For each biz id the URL is built with ``self.create_url()`` and
        requested. The ``var msgList`` JavaScript assignment is extracted and
        HTML-unescaped, every article link in it is loaded into an
        ``Article``, packed, written to the ``account_http`` table and, once
        the biz id is finished, sent with ``entity.uploads``. Articles for
        which ``backpack.create`` raises are logged and skipped.

        A biz id whose response has no ``msgList`` block is logged and skipped
        instead of failing with AttributeError. Does nothing when
        ``self.biz_list()`` returns a falsy value.
        """
        self.set_key_uin()
        _biz_list = self.biz_list()
        if not _biz_list:
            return
        prefix = 'https://mp.weixin.qq.com/s?'
        sql = '''   
                        INSERT INTO 
                            account_http(article_url, addon, account, account_id, author, id, title) 
                        VALUES 
                            (%s, %s, %s, %s, %s, %s, %s)
                    '''
        for biz in _biz_list:
            self._biz = biz
            self.create_url()
            log(self.url)
            resp = requests.get(self.url, headers=self.headers)
            # Empty response body.
            if len(resp.text) == 0:
                # NOTE(review): the retried response is discarded because the
                # loop stops right after it; confirm whether the retry result
                # was meant to be processed instead of breaking.
                self.set_key_uin()
                resp = requests.get(self.url, headers=self.headers)
                log('response 为空')
                break

            match = re.search('var msgList =.*?\';', resp.text)
            if match is None:
                log('msgList not found in response', self.url)
                continue
            escape_url = html.unescape(match.group())

            # TODO: de-duplicate mp.weixin links that appear inside article
            # content (seen with the account named 一个程序员的日常).
            urls = re.findall('mp.weixin.qq.com.*?#wechat_redirect',
                              escape_url)
            backpack_list = []
            # Reset per biz id so a biz id without usable articles neither
            # raises NameError nor reuses the previous biz id's entity.
            entity = None
            for url in urls:
                url = prefix + url.replace('amp;', '').replace(
                    r'mp.weixin.qq.com\\/s?', '')
                log('article', url)
                # A leftover 'content_url' means the match went wrong; skip it.
                if 'content_url' in url:
                    continue
                article = Article()
                article.create(url)
                log("catch {}".format(article.title))
                account = Account()
                # The account is built from the article itself here, which
                # differs from the source-search flow.
                account.name = article.author
                account.account = article.account
                account.get_account_id()
                entity = JsonEntity(article, account)
                backpack = Backpack()

                # create() raises for shared articles; such articles are skipped.
                try:
                    backpack.create(entity)
                except Exception as e:
                    log(e)
                    continue
                backpack_list.append(backpack.create_backpack())

                # Record the article in the database.
                _tuple = (entity.url, datetime.datetime.now(),
                          entity.account, entity.account_id, entity.author,
                          entity.id, entity.title)
                uploads_mysql(config_mysql, sql, _tuple)
            # len(urls) equals the former `article_count + 1` for a non-empty
            # list and correctly reports 0 (not 1) for an empty one.
            log('采集{}成功,共{}条文章'.format(self.name, len(urls)))

            log("发包")
            if entity:
                entity.uploads(backpack_list)
                log("uploads successful")
示例#8
0
    def run(self):
        """Crawl a fixed list of public accounts and upload their articles.

        For every non-empty account name the homepage is fetched with
        ``self.account_homepage()`` (accounts with a falsy result are skipped),
        the article URLs are taken from ``self.urls_article(html)`` and each
        article is packed, written to the ``account_http`` table and finally
        sent with ``entity.uploads``.

        The upload happens once per account: ``backpack_list`` is rebuilt for
        every account, so uploading only after the outer loop (as the previous
        version did) sent nothing but the last account's articles and raised
        NameError when no article had been processed at all.
        """
        account_list = ['大数据发布', '上海港湾集团', '绿盟365', '酌梦录', '瞄了个喵', '豪德通讯', '魔都娱乐1', '大侠的小宇宙', '澳洲梦', '盛世路跑', '佛系金融女',
                        '中卫今日热点', '金华社区居委会', '昕说法', '华农海洋研会', '尘埃一生', '革镇堡街道普法', '速度车行', '七分钟高清视频', '摘星少女酱',
                        '青海省格尔木市健桥医院', '乐用好车', '最强省钱喵喵君', '石柱港航', '荣盛物业长沙花语馨苑客服中心', '汕头超声集团', '中奥吴郡半岛', '隽永人生',
                        '飞鸿影视传媒', 'RGSE义乌雨具遮阳及防护用品展']
        sql = '''
                        INSERT INTO
                            account_http(article_url, addon, account, account_id, author, id, title)
                        VALUES
                            (%s, %s, %s, %s, %s, %s, %s)
                '''

        # NOTE(review): `articles` is filled but never read or returned, and
        # `ID` is derived from whatever `self.name` held before the loop
        # assigns it; confirm whether a per-account key was intended.
        articles = []
        ID = hash_md5(self.name)

        for name in account_list:
            if not name:
                continue
            self.name = name

            html_account = self.account_homepage()
            if not html_account:
                continue
            html, account_of_homepage = html_account
            log('start 公众号: ', self.name)
            urls_article = self.urls_article(html)

            account = Account()
            account.name = self.name
            account.account = account_of_homepage
            account.get_account_id()

            entity = None
            backpack_list = []
            for page_count, url in enumerate(urls_article):
                article = Article()
                article.create(url, self.name)
                log('文章标题:', article.title)
                log("第{}条".format(page_count))

                entity = JsonEntity(article, account)
                backpack = Backpack()
                backpack.create(entity)
                backpack_list.append(backpack.create_backpack())

                # Keep a dict form of every article.
                article_info = backpack.to_dict()
                articles.append({ID: article_info})

                # Record the article in the database. (The unused MongoDB
                # client that used to be opened here for every article was
                # never read or closed and has been removed.)
                _tuple = (
                    article.url, datetime.datetime.now(), entity.account, entity.account_id, entity.author, entity.id,
                    entity.title
                )
                uploads_mysql(config_mysql, sql, _tuple)

            log("发包")
            if entity:
                entity.uploads(backpack_list)
示例#9
0
    def run(self):
        """Poll ``self.biz_list()`` forever and crawl one account per batch.

        For every non-empty batch the biz ids are tried in order. For a biz id
        the URL is built with ``self.create_url()`` and requested; an empty
        response body is treated as an expired key, so ``self.set_key_uin()``
        is called and the request is repeated. The first article is used to
        build the ``Account``; then every article URL from
        ``self.urls_article(resp)`` is packed, written to the ``account_http``
        table and sent with ``entity.uploads``. After the first biz id that
        completes without an exception the batch is abandoned (``break``); a
        biz id that raises is logged and the next one is tried.

        This method never returns.
        """
        self.set_key_uin()
        sql = '''
                            INSERT INTO
                                account_http(article_url, addon, account, account_id, author, id, title)
                            VALUES
                                (%s, %s, %s, %s, %s, %s, %s)
                                    '''
        # NOTE(review): there is no delay between polls; if biz_list() does not
        # block, an empty result makes this loop spin. Confirm with the caller.
        while True:
            _biz_list = self.biz_list()
            if _biz_list:
                for biz in _biz_list:
                    # Reset per biz id: an entity left over from a biz id that
                    # failed midway must not trigger an upload for this one.
                    entity = None
                    try:
                        self._biz = biz
                        self.create_url()
                        print('添加成功')
                        resp = requests.get(self.url, headers=self.headers)
                        # An empty response body means the key has expired.
                        if len(resp.text) == 0:
                            log('key失效,获取新的key', self.url)
                            self.set_key_uin()
                            resp = requests.get(self.url, headers=self.headers)
                        else:
                            log('key有效,当前链接', self.url)
                        urls = self.urls_article(resp)

                        # Build the account from the first article; an empty
                        # `urls` raises here and is handled by the except below.
                        article = Article()
                        article.create(urls[0])
                        log("文章标题 {}".format(article.title))

                        log(article.title)
                        account = Account()
                        account.name = article.author
                        account.account = article.account
                        account.get_account_id()
                        # NOTE(review): this fixed id overrides whatever
                        # get_account_id() set just above; confirm it is intended.
                        account.account_id = 126774166
                        if not account.account:
                            log("错误,找不到account")

                        backpack_list = []
                        article_count = 0
                        for article_count, url in enumerate(urls):
                            log('文章链接', url)
                            article = Article()
                            article.create(url)
                            # str.replace is a no-op when the character is
                            # absent, so no membership check is needed.
                            article.title = article.title.replace('.', '')
                            article.title = article.title.replace('!', '')
                            log("文章标题 {}".format(article.title))
                            entity = JsonEntity(article, account)
                            backpack = Backpack()

                            # For shared articles the publish time cannot be
                            # matched and create() raises; skip such articles.
                            try:
                                backpack.create(entity)
                            except Exception as e:
                                log('share error', e)
                                continue
                            backpack_list.append(backpack.create_backpack())

                            # Record the article in the database.
                            _tuple = (entity.url, datetime.datetime.now(),
                                      entity.account, entity.account_id,
                                      entity.author, entity.id, entity.title)
                            uploads_mysql(config_mysql, sql, _tuple)
                        log('采集账号:{} 所有文章完毕,共{}条文章'.format(
                            self.name, article_count + 1))

                        log("发包")
                        if entity:
                            entity.uploads(backpack_list)
                            log("uploads successful")
                        print("end")
                        # Only one account is processed per batch.
                        break
                    except Exception as e:
                        log('account error', e)
                        continue