def main():
    print('Program initiating ... ...')
    print()
    status = True
    while status:
        try:
            # ============= Test connection =============
            mydb = connectDB()
            mycursor = mydb.cursor()
            mycursor.execute('SELECT * FROM ttd.news LIMIT 10;')
            print(len(mycursor.fetchall()), ' Connection works')
            print()
            # ============= Test connection END =============
            # r.encoding only affects r.text, so tell bs4 directly that the
            # raw bytes are GBK instead of setting r.encoding (a no-op here)
            r = requests.get('http://finance.people.com.cn/')
            soup = BeautifulSoup(r.content, features="html.parser",
                                 from_encoding="GBK")
            result = []

            # Front-page headline
            try:
                pageMainHeadURL = 'http://finance.people.com.cn/' + \
                    soup.find('div', {'class': 'title mt15'}).find('a').get('href')
                result.append(pageMainHeadURL)
            except Exception:
                print('Main page heading extraction error')
                print()

            # news_box groups
            newsBoxList = soup.findAll('div', {'class': 'news_box'})
            for item in newsBoxList:
                sectionHeadURL = 'http://finance.people.com.cn/' + item.find('a').get('href')
                result.append(sectionHeadURL)
                # Sub-entries inside this news_box
                minorList = item.findAll('a')
                for a in minorList:
                    minorHeadURL = 'http://finance.people.com.cn/' + a.get('href')
                    result.append(minorHeadURL)

            # Tabbed ("qiehuan") news groups
            qiehuanList = soup.findAll('div', {'class': 'headingNews qiehuan1_c'})
            for qiehuan in qiehuanList:
                minorList = qiehuan.findAll('div', {'class': 'hdNews clearfix'})
                for item in minorList:
                    minorHeadURL = item.find('a').get('href')
                    if 'http' not in minorHeadURL:
                        minorHeadURL = 'http://finance.people.com.cn/' + minorHeadURL
                    result.append(minorHeadURL)

            print('This round the result has ', len(result), ' items')
            print()

            # ======== Compare against the database for duplicates =========
            new_result = []
            for link in result:
                try:
                    # parameterized query: quotes in a URL cannot break the SQL
                    sql = 'SELECT news_id FROM ttd.news WHERE news_link = %s;'
                    mycursor.execute(sql, (str(link).strip(),))
                    news_link = mycursor.fetchall()
                    if len(news_link) < 1:
                        print('New link found', link)
                        print()
                        new_result.append(link)
                    else:
                        print(link, ' Existed')
                        print()
                except Exception:
                    print('Adding new item error')
                    print()
                    break
            print('This round has ', len(new_result), ' new items')
            print()
            # ======== Compare against the database for duplicates END =========

            # ======== Insert new rows =========
            if len(new_result) == 0:
                majorRandomPause()
            else:
                for link in new_result:
                    try:
                        mycursor.execute('SELECT news_id FROM ttd.news ORDER BY news_id DESC LIMIT 1;')
                        print('Execute Successfully')
                        news_id_count = mycursor.fetchall()[0][0] + 1
                        print(news_id_count)
                        sql = ('INSERT INTO ttd.news (news_id, news_title, news_source, news_date, '
                               'news_content, news_link, gov_tag, com_tag) '
                               'VALUES (%s, %s, %s, %s, %s, %s, %s, %s)')
                        rst = parsingContent(link)
                        # ======= Government tag - added 12.15 ==========
                        gov_tag = module_news_govTag.tagGov(mycursor, str(rst['news_title']), str(rst['news_content']))
                        com_tag = module_news_comTag.tagCom(mycursor, str(rst['news_title']), str(rst['news_content']))
                        # ======= Government tag - added 12.15 END ==========
                        val = (news_id_count, str(rst['news_title']), str(rst['news_source']),
                               str(rst['news_date']), str(rst['news_content']),
                               str(rst['news_link']), gov_tag, com_tag)
                        mycursor.execute(sql, val)
                        mydb.commit()
                        print(mycursor.rowcount, "record inserted.")
                        print()
                        minorRandomPause()
                    except Exception:
                        print('Getting info error')
                        print()
                        break
            # ======== Insert new rows END =========

            print('This round has ended, close connection')
            print()
            mydb.close()
        except Exception:
            print('An error happened, minor stop')
            print()
            try:
                mydb.close()  # may already be closed, or never opened if connectDB failed
            except Exception:
                pass
            minorRandomPause()
            print('Reconnecting')
            print()
            mydb = connectDB()
            mycursor = mydb.cursor()
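# A minimal sketch, not part of the original scripts: every scraper here
# builds absolute URLs by string concatenation ('http://finance.people.com.cn/'
# + href), which yields double slashes or wrong paths when an href is
# root-relative, parent-relative ('../...'), or already absolute.
# urllib.parse.urljoin handles all of these cases; the helper name
# `absolutize` is an illustrative assumption.
from urllib.parse import urljoin

def absolutize(base, href):
    """Resolve a possibly-relative href against the page's base URL.

    urljoin copes uniformly with absolute URLs ('http://...'), root-relative
    paths ('/n1/...'), protocol-relative links ('//host/path'), and parent
    references ('../...').
    """
    return urljoin(base, href.strip())

# e.g. absolutize('http://finance.people.com.cn/', '/n1/2021/c1004.html')
#      -> 'http://finance.people.com.cn/n1/2021/c1004.html'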
def main():
    print('Program initiating ... ...')
    print()
    status = True
    while status:
        try:
            # ============= Test connection =============
            mydb = connectDB()
            mycursor = mydb.cursor()
            mycursor.execute('SELECT * FROM ttd.news LIMIT 10;')
            print(len(mycursor.fetchall()), ' Connection works')
            print()
            # ============= Test connection END =============
            r = requests.get('http://www.sasac.gov.cn/n2588025/index.html')  # target URL
            soup = BeautifulSoup(r.content, features="html.parser")
            result = []

            # Front page, left column
            links = soup.findAll('div', {'class': 'wrz-alist'})
            for link in links[:-1]:
                tmp = link.findAll('li')
                for item in tmp:
                    if 'http' in item.find('a').get('href'):
                        result.append(item.find('a').get('href'))
                    else:
                        result.append('http://www.sasac.gov.cn/' +
                                      item.find('a').get('href').replace('..', ''))

            print('This round the result has ', len(result), ' items')
            print()

            # ======== Compare against the database for duplicates =========
            new_result = []
            for link in result:
                try:
                    # parameterized query: quotes in a URL cannot break the SQL
                    sql = 'SELECT news_id FROM ttd.news WHERE news_link = %s;'
                    mycursor.execute(sql, (str(link).strip(),))
                    news_link = mycursor.fetchall()
                    if len(news_link) < 1:
                        print('New link found', link)
                        print()
                        new_result.append(link)
                    else:
                        print(link, ' Existed')
                        print()
                except Exception:
                    print('Adding new item error')
                    print()
                    break
            print('This round has ', len(new_result), ' new items')
            print()
            # ======== Compare against the database for duplicates END =========

            # ======== Insert new rows =========
            if len(new_result) == 0:
                majorRandomPause()
            else:
                for link in new_result:
                    try:
                        mycursor.execute('SELECT news_id FROM ttd.news ORDER BY news_id DESC LIMIT 1;')
                        print('Execute Successfully')
                        news_id_count = mycursor.fetchall()[0][0] + 1
                        print(news_id_count)
                        sql = ('INSERT INTO ttd.news (news_id, news_title, news_source, news_date, '
                               'news_content, news_link, gov_tag, com_tag) '
                               'VALUES (%s, %s, %s, %s, %s, %s, %s, %s)')
                        rst = parsingContent(link)
                        # ======= Government tag - added 12.15 ==========
                        gov_tag = module_news_govTag.tagGov(mycursor, str(rst['news_title']), str(rst['news_content']))
                        com_tag = module_news_comTag.tagCom(mycursor, str(rst['news_title']), str(rst['news_content']))
                        # ======= Government tag - added 12.15 END ==========
                        val = (news_id_count, str(rst['news_title']), str(rst['news_source']),
                               str(rst['news_date']), str(rst['news_content']),
                               str(rst['news_link']), gov_tag, com_tag)
                        mycursor.execute(sql, val)
                        mydb.commit()
                        print(mycursor.rowcount, "record inserted.")
                        print()
                        minorRandomPause()
                    except Exception:
                        print('Getting info error')
                        print()
                        break
            # ======== Insert new rows END =========

            print('This round has ended, close connection')
            print()
            mydb.close()
        except Exception:
            print('An error happened, minor stop')
            print()
            try:
                mydb.close()  # may already be closed, or never opened if connectDB failed
            except Exception:
                pass
            minorRandomPause()
            print('Reconnecting')
            print()
            mydb = connectDB()
            mycursor = mydb.cursor()
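# A hedged sketch, assuming the scripts keep their current requests usage:
# none of them pass a timeout, so one stalled socket can hang a polling loop
# indefinitely. The helper name, retry count, and backoff values below are
# illustrative assumptions, not the repo's actual code.
import time
import requests

def fetch(url, retries=3, timeout=10):
    """GET a URL with a timeout and simple exponential backoff."""
    for attempt in range(retries):
        try:
            r = requests.get(url, timeout=timeout)
            r.raise_for_status()  # surface 4xx/5xx instead of parsing an error page
            return r
        except requests.RequestException:
            time.sleep(2 ** attempt)  # 1s, 2s, 4s between attempts
    return None  # caller decides how to handle a dead source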
def main():
    print('Xinhuanet Finance')
    print()
    # ============= Test connection =============
    mydb = connectDB()
    mycursor = mydb.cursor()
    # ============= Test connection END =============
    try:
        r = requests.get('http://xinhuanet.com/fortunepro/')
        soup = BeautifulSoup(r.content, features='html.parser')
    except Exception:
        return

    # ============== Front-page scrape ================
    news_list = soup.find('ul', {'class': 'silder_nav clearfix'}).findAll('li')
    news_list_item = {}  # maps href -> title
    # news_list_item_belong = {}
    for i in news_list:
        # currentLi = i.findAll('a')
        # for a in currentLi:
        #     news_list_item[a.find('a').text] = a.find('a').get('href')
        try:
            news_list_item[i.find('a').get('href')] = i.find('a').text
        except Exception:
            pass
        # news_list_item_belong[a.find('a').get('href')] = a.find('div', {'class': 'dysMiddleResultConItemRelevant clearfix'}).text
    print('Total', len(news_list_item), 'results')
    print()
    # ============== Front-page scrape END ===============

    # ============== Database comparison =================
    confirmed_new = []
    for a in news_list_item:
        try:
            # news_list_item keys are hrefs and values are titles, so the title
            # is matched against news_title and the href against news_link (the
            # original compared them the wrong way around, so nothing ever
            # matched); parameterized so quotes in a title cannot break the SQL
            sql = ('SELECT news_title, news_link FROM ttd.news '
                   'WHERE news_title = %s OR news_link = %s;')
            mycursor.execute(sql, (str(news_list_item[a]), str(a)))
            compareResult = mycursor.fetchall()
            if len(compareResult) == 0:
                confirmed_new.append(a)
        except Exception:
            lw.log_writer('Xinhuanet Finance: error while adding news')
    lw.log_writer('Xinhuanet Finance: ' + str(len(confirmed_new)) + ' new items this round')
    # ============== Database comparison END =================

    # ============== Main scrape =================
    if len(confirmed_new) == 0:
        print('No new items found, closing DB connection')
        print()
        mydb.close()
    else:
        for link in confirmed_new:
            sql = ('INSERT INTO ttd.news (news_title, news_source, news_date, '
                   'news_content, news_link, gov_tag, com_tag, topic_tag) '
                   'VALUES (%s, %s, %s, %s, %s, %s, %s, %s)')
            rst = parsingContent(link, news_list_item[link])
            # ======= Tags - added 12.15 ==========
            gov_tag = module_news_govTag.tagGov(mycursor, str(rst['news_title']), str(rst['news_content']))
            com_tag = module_news_comTag.tagCom(mycursor, str(rst['news_title']), str(rst['news_content']))
            topic_tag = module_news_topicTag.tagTopic(mycursor, str(rst['news_title']), str(rst['news_content']))
            # ======= Tags - added 12.15 END ==========
            val = (str(rst['news_title']), str(rst['news_source']), str(rst['news_date']),
                   str(rst['news_content']), str(rst['news_link']), gov_tag, com_tag, topic_tag)
            try:
                mycursor.execute(sql, val)
                mydb.commit()
            except Exception:
                lw.log_writer('Xinhuanet Finance: insert failed')
            lw.log_writer('Xinhuanet Finance: inserted ' + str(mycursor.rowcount) + ' record(s)')
            minorRandomPause()
        lw.log_writer('Xinhuanet Finance: round finished')
        mydb.close()
def main():
    print('Central Political and Legal Affairs Commission')
    print()
    # ============= Test connection =============
    mydb = connectDB()
    mycursor = mydb.cursor()
    # ============= Test connection END =============
    try:
        r = requests.get('http://www.chinapeace.gov.cn/chinapeace/c100008/list2020.shtml')
        soup = BeautifulSoup(r.content, features='html.parser')
    except Exception:
        return

    # ============== Front-page scrape ================
    news_list = soup.find('div', {'class': 'w1200 bgfff'}).find(
        'div', {'class': 'list_box_left'}).findAll('li')
    news_list_item = {}
    for i in news_list:
        currentLi = i.findAll('a')
        for a in currentLi:
            news_list_item[a.text] = a.get('href')
    print('Total', len(news_list_item), 'results')
    print()
    # ============== Front-page scrape END ===============

    # ============== Database comparison =================
    confirmed_new = []
    for a in news_list_item:
        try:
            # parameterized query: quotes in a title cannot break the SQL
            sql = ('SELECT news_title, news_link FROM ttd.news '
                   'WHERE news_title = %s OR news_link = %s;')
            mycursor.execute(sql, (str(a), str(news_list_item[a])))
            compareResult = mycursor.fetchall()
            if len(compareResult) == 0:
                confirmed_new.append(news_list_item[a])
        except Exception:
            lw.log_writer('Political and Legal Affairs Commission: error while adding news from the front page')
    lw.log_writer('Political and Legal Affairs Commission: ' + str(len(confirmed_new)) + ' new items this round')
    # ============== Database comparison END =================

    # ============== Main scrape =================
    if len(confirmed_new) == 0:
        print('No new items found, closing DB connection')
        print()
        mydb.close()
    else:
        for link in confirmed_new:
            sql = ('INSERT INTO ttd.news (news_title, news_source, news_date, '
                   'news_content, news_link, gov_tag, com_tag, topic_tag) '
                   'VALUES (%s, %s, %s, %s, %s, %s, %s, %s)')
            rst = parsingContent(link)
            # ======= Tags - added 12.15 ==========
            gov_tag = module_news_govTag.tagGov(mycursor, str(rst['news_title']), str(rst['news_content']))
            com_tag = module_news_comTag.tagCom(mycursor, str(rst['news_title']), str(rst['news_content']))
            topic_tag = module_news_topicTag.tagTopic(mycursor, str(rst['news_title']), str(rst['news_content']))
            # ======= Tags - added 12.15 END ==========
            val = (str(rst['news_title']), str(rst['news_source']), str(rst['news_date']),
                   str(rst['news_content']), str(rst['news_link']), gov_tag, com_tag, topic_tag)
            try:
                mycursor.execute(sql, val)
                mydb.commit()
            except Exception:
                lw.log_writer('Political and Legal Affairs Commission: insert failed')
            lw.log_writer('Political and Legal Affairs Commission: inserted ' + str(mycursor.rowcount) + ' record(s)')
            minorRandomPause()
        lw.log_writer('Political and Legal Affairs Commission: round finished')
        mydb.close()
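# The scripts disagree on what counts as a duplicate: the looping scrapers
# match on news_link, the Caixin/Eastmoney-style ones on news_title, and the
# two functions above on either column. A sketch of one shared check with the
# either-column semantics (the helper name `is_known` is an assumption):
def is_known(cursor, title, link):
    """True if a row with this title or this link already exists in ttd.news."""
    cursor.execute(
        'SELECT 1 FROM ttd.news WHERE news_title = %s OR news_link = %s LIMIT 1;',
        (title, link))
    return cursor.fetchone() is not None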
def main():
    print('Program initiating ... ...')
    print()
    status = True
    while status:
        try:
            # ============= Test connection =============
            mydb = connectDB()
            mycursor = mydb.cursor()
            mycursor.execute('SELECT * FROM ttd.news LIMIT 10;')
            print(len(mycursor.fetchall()), ' Connection works')
            print()
            # ============= Test connection END =============
            r = requests.get('http://caijing.chinadaily.com.cn/')  # target URL
            soup = BeautifulSoup(r.content, features="html.parser")
            result = []  # collected links

            # Collect every title/link on this page
            # Front page, right column
            topRightLinks = soup.find('div', {'class': 'yaowen'}).findAll('a')[1:]
            for link in topRightLinks:
                href = link.get('href')
                if '//' in href:
                    # hrefs here are protocol-relative ('//host/path'); prepend a
                    # scheme rather than replace('//', 'https://'), which would
                    # mangle hrefs that are already absolute
                    result.append('https:' + href if href.startswith('//') else href)

            # "Multinationals" section
            left_liebiao_1 = soup.find('div', {'class': 'left-liebiao'}).findAll(
                'div', {'class': 'busBox1'})
            for link in left_liebiao_1:
                href = link.find('a').get('href')
                result.append('https:' + href if href.startswith('//') else href)

            # "Industry & companies" section
            left_liebiao_2 = soup.findAll('div', {'class': 'left-liebiao'})[1].findAll(
                'div', {'class': 'busBox1'})
            for link in left_liebiao_2:
                href = link.find('a').get('href')
                result.append('https:' + href if href.startswith('//') else href)

            print('This round the result has ', len(result), ' items')
            print()

            # ======== Compare against the database for duplicates =========
            new_result = []
            for link in result:
                try:
                    # parameterized query: quotes in a URL cannot break the SQL
                    sql = 'SELECT news_id FROM ttd.news WHERE news_link = %s;'
                    mycursor.execute(sql, (str(link).strip(),))
                    news_link = mycursor.fetchall()
                    if len(news_link) < 1:
                        print('New link found', link)
                        print()
                        new_result.append(link)
                    else:
                        print(link, ' Existed')
                        print()
                except Exception:
                    print('Adding new item error')
                    print()
                    break
            print('This round has ', len(new_result), ' new items')
            print()
            # ======== Compare against the database for duplicates END =========

            # ======== Insert new rows =========
            if len(new_result) == 0:
                majorRandomPause()
            else:
                for link in new_result:
                    try:
                        mycursor.execute('SELECT news_id FROM ttd.news ORDER BY news_id DESC LIMIT 1;')
                        print('Execute Successfully')
                        news_id_count = mycursor.fetchall()[0][0] + 1
                        print(news_id_count)
                        sql = ('INSERT INTO ttd.news (news_id, news_title, news_source, news_date, '
                               'news_content, news_link, gov_tag, com_tag) '
                               'VALUES (%s, %s, %s, %s, %s, %s, %s, %s)')
                        rst = parsingContent(link)
                        # ======= Government tag - added 12.15 ==========
                        gov_tag = module_news_govTag.tagGov(mycursor, str(rst['news_title']), str(rst['news_content']))
                        com_tag = module_news_comTag.tagCom(mycursor, str(rst['news_title']), str(rst['news_content']))
                        # ======= Government tag - added 12.15 END ==========
                        val = (news_id_count, str(rst['news_title']), str(rst['news_source']),
                               str(rst['news_date']), str(rst['news_content']),
                               str(rst['news_link']), gov_tag, com_tag)
                        mycursor.execute(sql, val)
                        mydb.commit()
                        print(mycursor.rowcount, "record inserted.")
                        print()
                        minorRandomPause()
                    except Exception:
                        print('Getting info error')
                        print()
                        break
            # ======== Insert new rows END =========

            print('This round has ended, close connection')
            print()
            mydb.close()
        except Exception:
            print('An error happened, minor stop')
            print()
            try:
                mydb.close()  # may already be closed, or never opened if connectDB failed
            except Exception:
                pass
            minorRandomPause()
            print('Reconnecting')
            print()
            mydb = connectDB()
            mycursor = mydb.cursor()
def main():
    print('Program initiating ... ...')
    print()
    status = True
    while status:
        try:
            # ============= Test connection =============
            mydb = connectDB()
            mycursor = mydb.cursor()
            mycursor.execute('SELECT * FROM ttd.news LIMIT 10;')
            print(len(mycursor.fetchall()), ' Connection works')
            print()
            # ============= Test connection END =============
            r = requests.get('http://www.caixin.com/')
            soup = BeautifulSoup(r.content, features='html.parser')
            result = []  # collected links

            # Front page
            main_list = soup.find('div', {'class': 'news_list'}).findAll('dl')
            for item in main_list:
                result.append(item.find('dd').find('p').find('a').get('href'))
            print('This round has ', len(result), ' items')
            print()

            # Compare against the database for duplicates
            new_result = []
            for link in result:
                try:
                    # parameterized query: quotes in a URL cannot break the SQL
                    sql = 'SELECT news_id FROM ttd.news WHERE news_link = %s;'
                    mycursor.execute(sql, (str(link).strip(),))
                    news_link = mycursor.fetchall()
                    if len(news_link) < 1:
                        new_result.append(link)
                except Exception:
                    print('Adding new item error')
                    print()
                    break
            print('This round has ', len(new_result), ' new items')
            print()

            if len(new_result) == 0:
                print('No new item found, restart DB connection')
                print()
                mydb.close()
                majorRandomPause()
            else:
                for link in new_result:
                    try:
                        mycursor.execute('SELECT news_id FROM ttd.news ORDER BY news_id DESC LIMIT 1;')
                        print('Execute Successfully')
                        news_id_count = mycursor.fetchall()[0][0] + 1
                        print(news_id_count)
                        sql = ('INSERT INTO ttd.news (news_id, news_title, news_source, news_date, '
                               'news_content, news_link, gov_tag, com_tag) '
                               'VALUES (%s, %s, %s, %s, %s, %s, %s, %s)')
                        rst = parsingContent(link)
                        # ======= Tags - added 12.15 ==========
                        gov_tag = module_news_govTag.tagGov(mycursor, str(rst['news_title']), str(rst['news_content']))
                        com_tag = module_news_comTag.tagCom(mycursor, str(rst['news_title']), str(rst['news_content']))
                        # ======= Tags - added 12.15 END ==========
                        val = (news_id_count, str(rst['news_title']), str(rst['news_source']),
                               str(rst['news_date']), str(rst['news_content']),
                               str(rst['news_link']), gov_tag, com_tag)
                        mycursor.execute(sql, val)
                        mydb.commit()
                        print(mycursor.rowcount, "record inserted.")
                        print()
                        minorRandomPause()
                    except Exception:
                        print('Getting info error')
                        print()
                        break
            print('This round has ended, close connection')
            print()
            mydb.close()
        except Exception:
            print('An error happened, minor stop')
            print()
            try:
                mydb.close()  # may already be closed, or never opened if connectDB failed
            except Exception:
                pass
            minorRandomPause()
            print('Reconnecting')
            print()
            mydb = connectDB()
            mycursor = mydb.cursor()
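# The looping scripts compute the next id with
# 'SELECT news_id ... ORDER BY news_id DESC LIMIT 1' + 1, which races when two
# scrapers insert at once; the newer scripts below already omit news_id from
# the INSERT entirely. Assuming ttd.news.news_id is, or can be made,
# AUTO_INCREMENT (an assumption about the schema), a sketch:
def insert_news(mydb, row):
    """Insert one article; row = (title, source, date, content, link, gov_tag, com_tag)."""
    sql = ('INSERT INTO ttd.news (news_title, news_source, news_date, '
           'news_content, news_link, gov_tag, com_tag) '
           'VALUES (%s, %s, %s, %s, %s, %s, %s)')
    cur = mydb.cursor()
    cur.execute(sql, row)
    mydb.commit()
    return cur.lastrowid  # id assigned by MySQL; no SELECT-max race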
def main():
    print('Caixin')
    print()
    # ============= Test connection =============
    mydb = connectDB()
    mycursor = mydb.cursor()
    mycursor.execute('SELECT * FROM ttd.news LIMIT 10;')
    # ============= Test connection END =============
    r = requests.get('http://www.caixin.com/')
    soup = BeautifulSoup(r.content, features='html.parser')
    main_page_item = {}  # holds everything scraped from this page

    # ============= Front-page scrape =============
    main_list = soup.find('div', {'class': 'news_list'}).findAll('dl')
    for item in main_list:
        a = item.find('dd').find('p').find('a')
        main_page_item[a.text] = a.get('href')
    print('This round has ', len(main_page_item), ' items')
    print()
    # ============= Front-page scrape END =============

    # ============== Database comparison =================
    confirmed_new = []
    for a in main_page_item:
        try:
            # parameterized query: quotes in a title cannot break the SQL
            sql = 'SELECT news_id, news_title FROM ttd.news WHERE news_title = %s;'
            mycursor.execute(sql, (str(a),))
            compareResult = mycursor.fetchall()
            if len(compareResult) == 0:
                confirmed_new.append(main_page_item[a])
        except Exception:
            print('Error while adding a new item')
            print()
    print('This round found', len(confirmed_new), 'new items')
    # ============== Database comparison END =================

    if len(confirmed_new) == 0:
        print('No new items found, closing DB connection')
        print()
        mydb.close()
    else:
        for link in confirmed_new:
            try:
                sql = ('INSERT INTO ttd.news (news_title, news_source, news_date, '
                       'news_content, news_link, gov_tag, com_tag, topic_tag) '
                       'VALUES (%s, %s, %s, %s, %s, %s, %s, %s)')
                rst = parsingContent(link)
                # ======= Tags - added 12.15 ==========
                gov_tag = module_news_govTag.tagGov(mycursor, str(rst['news_title']), str(rst['news_content']))
                com_tag = module_news_comTag.tagCom(mycursor, str(rst['news_title']), str(rst['news_content']))
                topic_tag = module_news_topicTag.tagTopic(mycursor, str(rst['news_title']), str(rst['news_content']))
                # ======= Tags - added 12.15 END ==========
                val = (str(rst['news_title']), str(rst['news_source']), str(rst['news_date']),
                       str(rst['news_content']), str(rst['news_link']), gov_tag, com_tag, topic_tag)
                mycursor.execute(sql, val)
                mydb.commit()
                lw.log_writer('Caixin: inserted ' + str(mycursor.rowcount) + ' record(s)')
                minorRandomPause()
            except Exception:
                print('Getting info error')
                print()
                break
        lw.log_writer('Caixin script: round finished')
        mydb.close()
def main():
    print('Program initiating ... ...')
    print()
    status = True
    while status:
        try:
            # ============= Test connection =============
            mydb = connectDB()
            mycursor = mydb.cursor()
            mycursor.execute('SELECT * FROM ttd.news LIMIT 10;')
            print(len(mycursor.fetchall()), ' Connection works')
            print()
            # ============= Test connection END =============
            r = requests.get('http://finance.china.com.cn/')
            soup = BeautifulSoup(r.content, features="html.parser")
            results = []  # collected links

            # Collect every title/link on this page
            # Front page, left column ('class' key was missing in the original attrs dict)
            headLinks = soup.find('div', {'class': 'hot c'}).find(
                'div', {'class': 'fl hot-lf'}).findAll('a')
            for link in headLinks:
                if 'http' in link.get('href') and 'photo' not in link.get('href'):
                    results.append(link.get('href'))

            # "Finance & capital" section
            f_and_c = soup.find('div', {'class': 'mt20'}).find(
                'div', {'class': 'fl f-list'}).findAll('a')
            for link in f_and_c:
                if len(link.get('href')) > 26 and 'photo' not in link.get('href'):
                    results.append(link.get('href'))

            # "Industry & companies" section
            i_and_c = soup.find('div', {'class': 'indus mt20'}).find(
                'div', {'class': 'fl f-list pt10'}).findAll('a')
            for link in i_and_c:
                if 'finance' in link.get('href') and 'photo' not in link.get('href'):
                    results.append(link.get('href'))

            print('This round the result has ', len(results), ' items')

            # ======== Compare against the database for duplicates =========
            new_results = []
            for link in results:
                try:
                    # parameterized query: quotes in a URL cannot break the SQL
                    sql = 'SELECT news_id FROM ttd.news WHERE news_link = %s;'
                    mycursor.execute(sql, (str(link).strip(),))
                    news_link = mycursor.fetchall()
                    if len(news_link) < 1:
                        print('New link found', link)
                        print()
                        new_results.append(link)
                    else:
                        print(link, ' Existed')
                        print()
                except Exception:
                    print('Adding new item error')
                    print()
                    break
            print('This round has ', len(new_results), ' new items')
            print()
            # ======== Compare against the database for duplicates END =========

            # ======== Insert new rows =========
            if len(new_results) == 0:
                majorRandomPause()
            else:
                for link in new_results:
                    try:
                        mycursor.execute('SELECT news_id FROM ttd.news ORDER BY news_id DESC LIMIT 1;')
                        print('Execute Successfully')
                        news_id_count = mycursor.fetchall()[0][0] + 1
                        print(news_id_count)
                        sql = ('INSERT INTO ttd.news (news_id, news_title, news_source, news_date, '
                               'news_content, news_link, gov_tag, com_tag) '
                               'VALUES (%s, %s, %s, %s, %s, %s, %s, %s)')
                        rst = parsingContent(link)
                        # ======= Government tag - added 12.15 ==========
                        gov_tag = module_news_govTag.tagGov(mycursor, str(rst['news_title']), str(rst['news_content']))
                        com_tag = module_news_comTag.tagCom(mycursor, str(rst['news_title']), str(rst['news_content']))
                        # ======= Government tag - added 12.15 END ==========
                        val = (news_id_count, str(rst['news_title']), str(rst['news_source']),
                               str(rst['news_date']), str(rst['news_content']),
                               str(rst['news_link']), gov_tag, com_tag)
                        mycursor.execute(sql, val)
                        mydb.commit()
                        print(mycursor.rowcount, "record inserted.")
                        print()
                        minorRandomPause()
                    except Exception:
                        print('Getting info error')
                        print()
                        break
            # ======== Insert new rows END =========

            print('This round has ended, close connection')
            print()
            mydb.close()
        except Exception:
            print('An error happened, minor stop')
            print()
            try:
                mydb.close()  # may already be closed, or never opened if connectDB failed
            except Exception:
                pass
            minorRandomPause()
            print('Reconnecting')
            print()
            mydb = connectDB()
            mycursor = mydb.cursor()
def main():
    print('Tiantian Fund (Eastmoney) news')
    print()
    # ============= Test connection =============
    mydb = connectDB()
    mycursor = mydb.cursor()
    mycursor.execute('SELECT * FROM ttd.news LIMIT 10;')
    print(len(mycursor.fetchall()), ' Connection works')
    print()
    # ============= Test connection END =============
    r = requests.get('http://fund.eastmoney.com/a/cjjyw.html')
    soup = BeautifulSoup(r.content, features='html.parser')

    # ============== Front-page scrape ==============
    main_list = soup.find('div', {'class': 'mainCont'}).findAll('ul')  # the page has four <ul> blocks here
    main_page_item = {}  # holds everything scraped from this page
    for i in main_list:
        currentUl = i.findAll('a')
        for a in currentUl:
            main_page_item[a.text] = a.get('href')
    print('Total', len(main_page_item), 'results')
    print()
    # ============== Front-page scrape END ==============

    # ============== Database comparison =================
    confirmed_new = []
    for a in main_page_item:
        try:
            # parameterized query: quotes in a title cannot break the SQL
            sql = 'SELECT news_id, news_title FROM ttd.news WHERE news_title = %s;'
            mycursor.execute(sql, (str(a),))
            compareResult = mycursor.fetchall()
            if len(compareResult) == 0:
                confirmed_new.append(main_page_item[a])
        except Exception:
            lw.log_writer('Eastmoney script: error while adding news from the front page')
    lw.log_writer('Eastmoney script: ' + str(len(confirmed_new)) + ' new items this round')
    # ============== Database comparison END =================

    # ============== Main scrape =================
    if len(confirmed_new) == 0:
        print('No new items found, closing DB connection')
        print()
        mydb.close()
    else:
        for link in confirmed_new:
            sql = ('INSERT INTO ttd.news (news_title, news_source, news_date, '
                   'news_content, news_link, gov_tag, com_tag, topic_tag) '
                   'VALUES (%s, %s, %s, %s, %s, %s, %s, %s)')
            rst = parsingContent(link)
            # ======= Tags - added 12.15 ==========
            gov_tag = module_news_govTag.tagGov(mycursor, str(rst['news_title']), str(rst['news_content']))
            com_tag = module_news_comTag.tagCom(mycursor, str(rst['news_title']), str(rst['news_content']))
            topic_tag = module_news_topicTag.tagTopic(mycursor, str(rst['news_title']), str(rst['news_content']))
            # ======= Tags - added 12.15 END ==========
            val = (str(rst['news_title']), str(rst['news_source']), str(rst['news_date']),
                   str(rst['news_content']), str(rst['news_link']), gov_tag, com_tag, topic_tag)
            try:
                mycursor.execute(sql, val)
                mydb.commit()
            except Exception:
                lw.log_writer('Eastmoney script: insert failed')
            lw.log_writer('Eastmoney script: inserted ' + str(mycursor.rowcount) + ' record(s)')
            minorRandomPause()
        lw.log_writer('Eastmoney script: round finished')
        mydb.close()
def main():
    print('China Securities Journal (cs.com.cn)')
    print()
    # ============= Test connection =============
    mydb = connectDB()
    mycursor = mydb.cursor()
    mycursor.execute('SELECT * FROM ttd.news LIMIT 10;')
    print(len(mycursor.fetchall()), ' Connection works')
    print()
    # ============= Test connection END =============
    r = requests.get('http://www.cs.com.cn/')
    soup = BeautifulSoup(r.content, features='html.parser')

    # ============== Front-page scrape ==============
    main_page_item = {}
    top_part = soup.find('div', {'class': 'box410 ch_focus space_l1'}).findAll('li')
    for i in top_part:
        try:
            if 'http' not in i.find('a').get('href'):  # keep only relative (on-site) links
                main_page_item[i.text] = i.find('a').get('href')
        except Exception:
            pass
    mid_part = soup.find('div', {'class': 'box_l1'}).findAll('li')
    for i in mid_part:
        try:
            if 'http' not in i.find('a').get('href'):
                main_page_item[i.text] = i.find('a').get('href')
        except Exception:
            pass
    print('Total', len(main_page_item), 'results')
    print()
    # ============== Front-page scrape END ==============

    # ============== Database comparison =================
    confirmed_new = []
    for a in main_page_item:
        try:
            # parameterized query: quotes in a title cannot break the SQL
            sql = 'SELECT news_id, news_title FROM ttd.news WHERE news_title = %s;'
            mycursor.execute(sql, (str(a),))
            compareResult = mycursor.fetchall()
            if len(compareResult) == 0:
                confirmed_new.append(main_page_item[a])
        except Exception:
            lw.log_writer('China Securities Journal: error while adding news from the front page')
    print('This round found', len(confirmed_new), 'new items')
    # ============== Database comparison END =================

    # ============== Main scrape =================
    if len(confirmed_new) == 0:
        print('No new items found, closing DB connection')
        print()
        mydb.close()
    else:
        for link in confirmed_new:
            sql = ('INSERT INTO ttd.news (news_title, news_source, news_date, '
                   'news_content, news_link, gov_tag, com_tag, topic_tag) '
                   'VALUES (%s, %s, %s, %s, %s, %s, %s, %s)')
            rst = parsingContent(link[1:])  # hrefs here are relative ('./...'); drop the leading dot
            # ======= Tags - added 12.15 ==========
            gov_tag = module_news_govTag.tagGov(mycursor, str(rst['news_title']), str(rst['news_content']))
            com_tag = module_news_comTag.tagCom(mycursor, str(rst['news_title']), str(rst['news_content']))
            topic_tag = module_news_topicTag.tagTopic(mycursor, str(rst['news_title']), str(rst['news_content']))
            # ======= Tags - added 12.15 END ==========
            val = (str(rst['news_title']), str(rst['news_source']), str(rst['news_date']),
                   str(rst['news_content']), str(rst['news_link']), gov_tag, com_tag, topic_tag)
            mycursor.execute(sql, val)
            mydb.commit()
            lw.log_writer('China Securities Journal: inserted ' + str(mycursor.rowcount) + ' record(s)')
            minorRandomPause()
        lw.log_writer('China Securities Journal: round finished')
        mydb.close()
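# minorRandomPause / majorRandomPause are defined elsewhere in the repo; the
# sketch below only illustrates their assumed contract (sleep for a randomized
# short or long interval so requests don't land on a fixed schedule). It is
# not the repo's actual implementation, and the interval bounds are invented.
import random
import time

def minorRandomPause():
    time.sleep(random.uniform(5, 30))     # short pause between two articles

def majorRandomPause():
    time.sleep(random.uniform(300, 900))  # long pause when nothing new was found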