def run(self):
    """Crawl the articles of every account name from ``self.get_name()``.

    For each name the account homepage is requested through
    ``self.account_homepage(name)``; when that returns a falsy value the
    account is logged as not found and skipped. Article links are extracted
    from the homepage HTML, each one is loaded into an ``Article`` (articles
    flagged ``is_share`` are skipped), packed into a ``Backpack`` payload and
    recorded in the ``account_http`` table. The payloads collected for one
    account are uploaded together after its last link was processed.
    """
    sql = ''' INSERT INTO account_http(article_url, addon, account, account_id, author, id, title) VALUES (%s, %s, %s, %s, %s, %s, %s) '''
    for name in self.get_name():
        log('start {}'.format(name))
        self.name = name
        # Request the homepage once and reuse the result for unpacking.
        homepage = self.account_homepage(name)
        if not homepage:
            # Skip accounts the search could not find.
            log('not find {}'.format(self.name))
            continue
        page_html, _account = homepage

        # All article links embedded in the homepage.
        items = re.findall('"content_url":".*?,"copyright_stat"', page_html)
        backpack_list = []
        # Reset per account: without this an account that yields no article
        # raises NameError below, or uploads through the previous account's
        # entity.
        entity = None
        for item in items:
            # Drop the 15-character '"content_url":"' prefix and the last 18
            # characters of the match, then remove the 'amp;' left over from
            # HTML-escaped ampersands.
            url_last = item[15:-18].replace('amp;', '')
            url = 'https://mp.weixin.qq.com' + url_last
            article = Article()
            article.create(url)
            if article.is_share is True:
                continue
            log("catch {}".format(article.title))
            # The account is built from the article itself here, not from the
            # homepage search result.
            account = Account()
            account.name = article.author
            account.account = article.account
            account.get_account_id()
            entity = JsonEntity(article, account)
            backpack = Backpack()
            backpack.create(entity)
            backpack_list.append(backpack.create_backpack())
            # Record the article in the database.
            _tuple = (
                article.url,
                datetime.datetime.now(),
                entity.account,
                entity.account_id,
                entity.author,
                entity.id,
                entity.title,
            )
            uploads_mysql(config_mysql, sql, _tuple)
        # Report the number of links processed (not the last loop index).
        log('catch {} successul 共{}条文章'.format(self.name, len(items)))
        log("发包")
        if entity:
            entity.uploads(backpack_list)
            log("uploads successful")
def run(self):
    """Crawl a fixed list of accounts and upload their articles.

    For each hard-coded account name the homepage is fetched with
    ``self.account_homepage()``; accounts for which it returns a falsy value
    are skipped. The article URLs come from ``self.urls_article``; every
    article is packed into a ``Backpack`` payload and recorded in the
    ``account_http`` table. The payloads of one account are uploaded together
    after its last article.
    """
    sql = ''' INSERT INTO account_http(article_url, addon, account, account_id, author, id, title) VALUES (%s, %s, %s, %s, %s, %s, %s) '''
    account_list = [
        '晚聊伴夜',
        '氢氪财经',
        '菲迪克智慧工程企业管理平台',
        '山西同乡群',
        '筱猫影视',
        '沈阳南动车运用所',
        '潇湘茶',
        '众智睿赢企业管理咨询有限公司',
        '微景相册',
        '书悦堂',
        '分享好宝贝',
        '民艺旅舍',
        '女王Dcup',
        '轻松定位美丽',
        '乐清市红辣椒越剧艺苑',
        '畅舞馆',
        '人禾健康产业',
        '常州格物斯坦机器人创客中心',
        '千秋妃子',
        '崇左航博',
    ]
    for name in account_list:
        self.name = name
        html_account = self.account_homepage()
        if not html_account:
            continue
        homepage_html, account_of_homepage = html_account
        log('start 公众号: ', self.name)
        urls_article = self.urls_article(homepage_html)

        # NOTE(review): the class name is spelled ``Acount`` here — confirm it
        # matches the project's definition.
        account = Acount()
        account.name = self.name
        account.account = account_of_homepage
        account.get_account_id()

        backpack_list = []
        # Reset per account: without this an account without articles raises
        # NameError below, or uploads through the previous account's entity.
        entity = None
        for page_count, url in enumerate(urls_article):
            article = Article()
            article.create(url, self.name)
            log('文章标题:', article.title)
            log("第{}条".format(page_count))
            entity = JsonEntity(article, account)
            backpack = Backpack()
            backpack.create(entity)
            backpack_list.append(backpack.create_backpack())
            # Record the article in the database.
            _tuple = (
                article.url,
                datetime.datetime.now(),
                entity.account,
                entity.account_id,
                entity.author,
                entity.id,
                entity.title,
            )
            uploads_mysql(config_mysql, sql, _tuple)
        log("发包")
        if entity:
            entity.uploads(backpack_list)
def run(self):
    """Crawl the first articles of the hard-coded accounts and upload them.

    For each account name the homepage is fetched with
    ``self.account_homepage()``; accounts for which it returns a falsy value
    are skipped. At most five articles per account are loaded, packed into
    ``Backpack`` payloads and recorded in the ``account_http`` table. All
    payloads are sent in a single ``uploads_datacenter_unity`` call after the
    last account, using the most recently built entity.
    """
    sql = ''' INSERT INTO account_http(article_url, addon, account, account_id, author, id, title) VALUES (%s, %s, %s, %s, %s, %s, %s) '''
    account_list = ['有看投', ]
    entity = None
    backpack_list = []
    for name in account_list:
        self.name = name
        html_account = self.account_homepage()
        if not html_account:
            continue
        homepage_html, account_of_homepage = html_account
        log('start 公众号: ', self.name)
        urls_article = self.urls_article(homepage_html)

        account = Account()
        account.name = self.name
        account.account = account_of_homepage
        account.get_account_id()

        for page_count, url in enumerate(urls_article):
            article = Article()
            # This variant hands the Account object (not the name) to create().
            article.create(url, account)
            log('文章标题:', article.title)
            log("第{}条".format(page_count))
            entity = JsonEntity(article, account)
            backpack = Backpack()
            backpack.create(entity)
            backpack_list.append(backpack.create_backpack())
            # Record the article in the database.
            _tuple = (
                article.url,
                datetime.datetime.now(),
                entity.account,
                entity.account_id,
                entity.author,
                entity.id,
                entity.title,
            )
            uploads_mysql(config_mysql, sql, _tuple)
            # Stop after the article with index 4, i.e. five articles.
            if page_count == 4:
                break
    log("发包")
    if entity:
        entity.uploads_datacenter_unity(backpack_list)
    print('end')
def run(self):
    """Crawl a handful of articles per account and upload them.

    The account names come from ``global_account_list`` or, when that is
    falsy, from a one-element default list. For every account whose homepage
    lookup succeeds, the article with index 0 is skipped and the articles with
    index 1 through 5 are packed, recorded in the ``account_http`` table and
    then uploaded per account. The account id is taken from
    ``global_account_id`` with a hard-coded fallback.
    """
    insert_sql = ''' INSERT INTO account_http(article_url, addon, account, account_id, author, id, title) VALUES (%s, %s, %s, %s, %s, %s, %s) '''
    names = global_account_list or ['刀口谈兵', ]
    for current_name in names:
        self.name = current_name
        homepage = self.account_homepage()
        if not homepage:
            continue
        page_html, homepage_account = homepage
        log('start 公众号: ', self.name)
        article_urls = self.urls_article(page_html)

        account = Account()
        account.name = self.name
        account.account = homepage_account
        # The id is not looked up; it is supplied externally or defaulted.
        account.account_id = global_account_id or 126776905

        entity = None
        backpack_list = []
        for index, article_url in enumerate(article_urls):
            # The first URL is deliberately left out.
            if index == 0:
                continue
            article = Article()
            article.create(article_url, self.name)
            log('文章标题:', article.title)
            log("第{}条".format(index))
            entity = JsonEntity(article, account)
            backpack = Backpack()
            backpack.create(entity)
            backpack_list.append(backpack.create_backpack())
            # Record the article in the database.
            row = (
                article.url,
                datetime.datetime.now(),
                entity.account,
                entity.account_id,
                entity.author,
                entity.id,
                entity.title,
            )
            uploads_mysql(config_mysql, insert_sql, row)
            # Stop once the article with index 5 has been stored.
            if index == 5:
                break
        log("发包")
        if entity:
            entity.uploads(backpack_list)
    print('end')
def run(self):
    """Crawl the article list of one hard-coded biz id and upload it.

    The profile page is requested, the ``var msgList = ...';`` JavaScript
    assignment is cut out of the response and HTML-unescaped, and every
    article link found in it is loaded, packed and recorded in the
    ``account_http`` table. All payloads are uploaded together at the end.
    When the response contains no ``msgList`` assignment the method logs
    that and returns without crawling anything.

    NOTE(review): the request URL embeds a fixed uin/key pair — presumably
    these expire; confirm how they are meant to be refreshed.
    """
    sql = ''' INSERT INTO account_http(article_url, addon, account, account_id, author, id, title) VALUES (%s, %s, %s, %s, %s, %s, %s) '''
    biz = 'MjM5MDYxNzcwNA'
    self.url = 'https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz={}==&uin=MTA2NjAyMjkyMA==&key=2d9f0071c5e3853997fb949a262f6e94880e0112444bb490548a0fa39b12232e3736b5d92b5f923541e1aa4a3dbd3c63d4bc9cbfc09dcb9911e82e9f497263a10535ea271c5479e87a2a7623e8179aa0'.format(biz)
    resp = requests.get(self.url, headers=self.headers)

    # re.search returns None when nothing matches; bail out instead of
    # failing with an AttributeError on ``.group()``.
    match = re.search('var msgList =.*?\';', resp.text)
    if match is None:
        log('msgList not found', self.url)
        return
    escape_url = html.unescape(match.group())
    urls = re.findall('content_url.*?mp.weixin.qq.com.*?#wechat_redirect', escape_url)
    prefix = 'https://mp.weixin.qq.com/s?'

    backpack_list = []
    # Stays None when no link was found, so the upload step is skipped
    # instead of raising NameError.
    entity = None
    for raw_url in urls:
        # Remove the 'amp;' of escaped ampersands and the escaped
        # scheme/host part, then put the canonical prefix in front.
        url = prefix + raw_url.replace('amp;', '').replace(
            r'content_url":' r'"http:\\/\\/mp.weixin.qq.com\\/s?', '')
        log('文章链接', url)
        article = Article()
        article.create(url)
        log("文章标题 {}".format(article.title))
        # NOTE(review): the class name is spelled ``Acount`` here — confirm it
        # matches the project's definition.
        account = Acount()
        # The account is built from the article itself.
        account.name = article.author
        account.account = article.account
        account.get_account_id()
        entity = JsonEntity(article, account)
        backpack = Backpack()
        # Shared articles have no publish time for the regex to match, which
        # makes create() raise; such articles are logged and skipped.
        try:
            backpack.create(entity)
        except Exception as e:
            log('share error', e)
            continue
        backpack_list.append(backpack.create_backpack())
        # Record the article in the database.
        _tuple = (
            entity.url,
            datetime.datetime.now(),
            entity.account,
            entity.account_id,
            entity.author,
            entity.id,
            entity.title,
        )
        uploads_mysql(config_mysql, sql, _tuple)
    # Number of links processed; 0 when the page listed none.
    log('采集账号:{} 所有文章完毕,共{}条文章'.format(self.name, len(urls)))
    log("发包")
    if entity:
        entity.uploads(backpack_list)
        log("uploads successful")
def save_to_mysql(entity):
    """Insert one row describing ``entity`` into the ``account_http`` table.

    The row holds the entity's url, the current ``datetime.datetime.now()``
    timestamp, account, account_id, author, id and title. Any exception
    raised while obtaining the MySQL configuration or running the insert is
    caught and logged; it is not propagated to the caller.
    """
    insert_sql = ''' INSERT INTO account_http(article_url, addon, account, account_id, author, id, title) VALUES (%s, %s, %s, %s, %s, %s, %s) '''
    row = (
        entity.url,
        datetime.datetime.now(),
        entity.account,
        entity.account_id,
        entity.author,
        entity.id,
        entity.title,
    )
    try:
        mysql_conf = get_mysql_new()
        uploads_mysql(mysql_conf, insert_sql, row)
    except Exception as err:
        log.info('数据库上传错误 {}'.format(err))
def run(self):
    """Crawl the article list of every biz id from ``self.biz_list()``.

    For each biz id the profile URL is built with ``self.create_url()`` and
    requested. The ``var msgList = ...';`` assignment is cut out of the
    response and HTML-unescaped, every article link in it is loaded, packed
    and recorded in the ``account_http`` table, and the payloads of that biz
    id are uploaded together. A response without ``msgList`` is logged and
    the biz id is skipped; an empty response body ends the whole run.
    """
    sql = ''' INSERT INTO account_http(article_url, addon, account, account_id, author, id, title) VALUES (%s, %s, %s, %s, %s, %s, %s) '''
    prefix = 'https://mp.weixin.qq.com/s?'
    self.set_key_uin()
    _biz_list = self.biz_list()
    if not _biz_list:
        return
    for biz in _biz_list:
        self._biz = biz
        self.create_url()
        log(self.url)
        resp = requests.get(self.url, headers=self.headers)
        if not resp.text:
            # Empty response body: refresh key/uin, request again, then stop.
            # NOTE(review): the second response is never used because the
            # loop is left right away, and the URL is not rebuilt after the
            # refresh — confirm whether a retry was intended here.
            self.set_key_uin()
            resp = requests.get(self.url, headers=self.headers)
            log('response 为空')
            break

        # re.search returns None when nothing matches; skip this biz id
        # instead of failing with an AttributeError on ``.group()``.
        match = re.search('var msgList =.*?\';', resp.text)
        if match is None:
            log('msgList not found', self.url)
            continue
        escape_url = html.unescape(match.group())
        # TODO: de-duplicate mp.weixin links that appear inside the article
        # content (seen with the account named '一个程序员的日常').
        urls = re.findall('mp.weixin.qq.com.*?#wechat_redirect', escape_url)

        backpack_list = []
        # Reset per biz id: without this an empty link list raises NameError
        # below, or uploads through the previous biz id's entity.
        entity = None
        for raw_url in urls:
            url = prefix + raw_url.replace('amp;', '').replace(
                r'mp.weixin.qq.com\\/s?', '')
            log('article', url)
            # A link that still contains 'content_url' was matched wrongly.
            if 'content_url' in url:
                continue
            article = Article()
            article.create(url)
            log("catch {}".format(article.title))
            # The account is built from the article itself.
            account = Account()
            account.name = article.author
            account.account = article.account
            account.get_account_id()
            entity = JsonEntity(article, account)
            backpack = Backpack()
            # Per the original note, shared articles make create() raise;
            # they are logged and skipped.
            try:
                backpack.create(entity)
            except Exception as e:
                log(e)
                continue
            backpack_list.append(backpack.create_backpack())
            # Record the article in the database.
            _tuple = (
                entity.url,
                datetime.datetime.now(),
                entity.account,
                entity.account_id,
                entity.author,
                entity.id,
                entity.title,
            )
            uploads_mysql(config_mysql, sql, _tuple)
        # Number of links processed; 0 when the page listed none.
        log('采集{}成功,共{}条文章'.format(self.name, len(urls)))
        log("发包")
        if entity:
            entity.uploads(backpack_list)
            log("uploads successful")
def run(self):
    """Crawl a fixed list of accounts and upload their articles.

    For each non-empty hard-coded account name the homepage is fetched with
    ``self.account_homepage()``; accounts for which it returns a falsy value
    are skipped. Every article URL from ``self.urls_article`` is loaded,
    packed into a ``Backpack`` payload and recorded in the ``account_http``
    table. The payloads of one account are uploaded together after its last
    article.
    """
    sql = ''' INSERT INTO account_http(article_url, addon, account, account_id, author, id, title) VALUES (%s, %s, %s, %s, %s, %s, %s) '''
    account_list = [
        '大数据发布',
        '上海港湾集团',
        '绿盟365',
        '酌梦录',
        '瞄了个喵',
        '豪德通讯',
        '魔都娱乐1',
        '大侠的小宇宙',
        '澳洲梦',
        '盛世路跑',
        '佛系金融女',
        '中卫今日热点',
        '金华社区居委会',
        '昕说法',
        '华农海洋研会',
        '尘埃一生',
        '革镇堡街道普法',
        '速度车行',
        '七分钟高清视频',
        '摘星少女酱',
        '青海省格尔木市健桥医院',
        '乐用好车',
        '最强省钱喵喵君',
        '石柱港航',
        '荣盛物业长沙花语馨苑客服中心',
        '汕头超声集团',
        '中奥吴郡半岛',
        '隽永人生',
        '飞鸿影视传媒',
        'RGSE义乌雨具遮阳及防护用品展',
    ]
    # NOTE(review): ``articles`` is filled below but never read or returned,
    # and ``ID`` is derived from whatever ``self.name`` holds on entry (before
    # the loop reassigns it) — confirm whether a per-account key was intended.
    articles = []
    ID = hash_md5(self.name)
    for name in account_list:
        if not name:
            continue
        self.name = name
        html_account = self.account_homepage()
        if not html_account:
            continue
        homepage_html, account_of_homepage = html_account
        log('start 公众号: ', self.name)
        urls_article = self.urls_article(homepage_html)

        account = Account()
        account.name = self.name
        account.account = account_of_homepage
        account.get_account_id()

        backpack_list = []
        # Reset per account: without this an account without articles raises
        # NameError below, or uploads through the previous account's entity.
        entity = None
        for page_count, url in enumerate(urls_article):
            article = Article()
            article.create(url, self.name)
            log('文章标题:', article.title)
            log("第{}条".format(page_count))
            entity = JsonEntity(article, account)
            backpack = Backpack()
            backpack.create(entity)
            backpack_list.append(backpack.create_backpack())
            # Keep a dict form of every article.
            article_info = backpack.to_dict()
            articles.append({ID: article_info})
            # Record the article in the database.
            _tuple = (
                article.url,
                datetime.datetime.now(),
                entity.account,
                entity.account_id,
                entity.author,
                entity.id,
                entity.title,
            )
            uploads_mysql(config_mysql, sql, _tuple)
        log("发包")
        if entity:
            entity.uploads(backpack_list)
def run(self):
    """Poll ``self.biz_list()`` forever and crawl one account per batch.

    Each pass of the outer loop fetches a batch of biz ids. The ids are tried
    in order; the first one that is processed without an exception ends the
    batch, while an id that raises is logged and the next one is tried. For
    the processed id every article URL from ``self.urls_article`` is loaded,
    packed into a ``Backpack`` payload and recorded in the ``account_http``
    table, and the payloads are uploaded together. The method has no exit
    condition of its own.
    """
    insert_sql = ''' INSERT INTO account_http(article_url, addon, account, account_id, author, id, title) VALUES (%s, %s, %s, %s, %s, %s, %s) '''
    self.set_key_uin()
    while True:
        _biz_list = self.biz_list()
        if not _biz_list:
            continue
        entity = None
        for biz in _biz_list:
            try:
                self._biz = biz
                self.create_url()
                print('添加成功')
                resp = requests.get(self.url, headers=self.headers)
                if resp.text:
                    log('key有效,当前链接', self.url)
                else:
                    # An empty response body means the key has expired:
                    # obtain a new one and request the same URL again.
                    log('key失效,获取新的key', self.url)
                    self.set_key_uin()
                    resp = requests.get(self.url, headers=self.headers)
                urls = self.urls_article(resp)

                # Build the account from the first article of the list.
                first_article = Article()
                first_article.create(urls[0])
                log("文章标题 {}".format(first_article.title))
                log(first_article.title)
                account = Account()
                account.name = first_article.author
                account.account = first_article.account
                account.get_account_id()
                # NOTE(review): this fixed id replaces whatever
                # get_account_id() produced — confirm it is still wanted.
                account.account_id = 126774166
                if not account.account:
                    log("错误,找不到account")

                backpack_list = []
                article_count = 0
                for article_count, url in enumerate(urls):
                    log('文章链接', url)
                    article = Article()
                    article.create(url)
                    # Strip '.' and the full-width '!' from the title.
                    article.title = article.title.replace('.', '').replace('!', '')
                    log("文章标题 {}".format(article.title))
                    entity = JsonEntity(article, account)
                    backpack = Backpack()
                    # Shared articles have no publish time for the regex to
                    # match, which makes create() raise; skip those.
                    try:
                        backpack.create(entity)
                    except Exception as share_err:
                        log('share error', share_err)
                        continue
                    backpack_list.append(backpack.create_backpack())
                    # Record the article in the database.
                    row = (
                        entity.url,
                        datetime.datetime.now(),
                        entity.account,
                        entity.account_id,
                        entity.author,
                        entity.id,
                        entity.title,
                    )
                    uploads_mysql(config_mysql, insert_sql, row)
                log('采集账号:{} 所有文章完毕,共{}条文章'.format(
                    self.name, article_count + 1))
                log("发包")
                if entity:
                    entity.uploads(backpack_list)
                    log("uploads successful")
                print("end")
                # Only one account is handled per batch.
                break
            except Exception as account_err:
                log('account error', account_err)
                continue