def read_detial(url, i):
    detial_html = htmlSource.get_html(url_p=url, type_p='rg')
    #print(detial_html)
    # save the raw html page to disk
    files = File_file()
    names = url.split('/')
    file_name = names[len(names) - 1]
    files.save_source(path=path, file=file_name, all_the_text=detial_html, encoding_='utf-8')
    # extraction rules: title, publish date, article body
    colum = [
        ('title', '//h1[@class="articleHead"]//text()', 'l'),
        ('pushDate', '//div[@class="info"]//span[@class="infoA"][@id="pubtime_baidu"]//text()', 'l'),
        ('content', '//div[@class="articleText"]//text()', 'sarra', ',')
    ]
    result = rule.html_content_analysis_detial(html_text=detial_html, column=colum, url=url)
    print(result)
    # append one row to the CSV:
    # web_name, web_url, title, text, publish_date, page, crawl_time
    csv = Csv_base()
    csv.write_csv_file_line(file_path=path + "/data.csv", str=[
        '参考消息', url, result[0][1], result[1][1], result[2][1], i,
        time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    ])
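
# read_detial assumes get_html always returns usable HTML. A minimal, hypothetical
# retry helper (not part of the original scripts) that callers could use instead of
# calling htmlSource.get_html directly; it assumes a failed fetch either raises or
# returns an empty string, and simply waits and retries a few times.
def fetch_with_retry(url, retries=3, delay=2):
    for attempt in range(retries):
        try:
            html_text = htmlSource.get_html(url_p=url, type_p='rg')
            if html_text:
                return html_text
        except Exception as e:
            print('fetch failed (%d/%d): %s' % (attempt + 1, retries, e))
        time.sleep(delay)
    return ''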
def main():
    floder = File_floder()
    floder.add(path_p=path)
    csv = Csv_base()
    csv.write_csv_file_line(
        file_path=path + "/data.csv", mode='w+',
        str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间'])
    # crawler: mil.huanqiu.com world channel, pages 1-100
    start_url = "http://mil.huanqiu.com/world/%s.html"
    for i in range(1, 101):
        # the first page is "index.html"; later pages use the page number
        s = 'index' if i == 1 else i
        url = start_url % (s)
        #print(url)
        list_html = htmlSource.get_html(url_p=url, type_p='rg')
        #print(list_html)
        colum = [('a', '//div[@class="fallsFlow"]//ul//li//a//@href', 'l')]
        list = rule.html_content_analysis_detial(html_text=list_html, column=colum, url=url)
        #print(list)
        for a in list[0][1]:
            read_detial(a, i)
def main():
    floder = File_floder()
    floder.add(path_p=path)
    csv = Csv_base()
    csv.write_csv_file_line(
        file_path=path + "/data.csv", mode='w+',
        str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间'])
    # crawler: www.81.cn "rd" channel, pages 1-5
    start_url = "http://www.81.cn/rd/node_92585%s.htm"
    for i in range(1, 6):
        # the first page has no suffix; later pages are "_2", "_3", ...
        ss = '' if i == 1 else '_%d' % i
        url = start_url % (ss)
        #print(url)
        list_html = htmlSource.get_html(url_p=url, type_p='rg')
        #print(list_html)
        colum = [('a', '//div[@class="content"]//ul//li//a//@href', 'sab', 'http://www.81.cn/rd/')]
        list = rule.html_content_analysis_detial(html_text=list_html, column=colum, url=url)
        #print(list)
        for a in list[0][1]:
            read_detial(a, i)
def main():
    floder = File_floder()
    floder.add(path_p=path)
    csv = Csv_base()
    csv.write_csv_file_line(
        file_path=path + "/data.csv", mode='w+',
        str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间'])
    # crawler: junshi.xilu.com list pages 1-100
    start_url = "http://junshi.xilu.com/dfjs/index_1372_%d.html"
    for i in range(1, 101):
        url = start_url % (i)
        #print(url)
        list_html = htmlSource.get_html(url_p=url, type_p='rg')
        #print(list_html)
        colum = [('a',
                  '//div[@class="newslist_box"]//ul//li//div[@class="newslist_tit"]//a//@href',
                  'l')]
        list = rule.html_content_analysis_detial(html_text=list_html, column=colum, url=url)
        #print(list)
        for a in list[0][1]:
            read_detial(a, i)
def main():
    floder = File_floder()
    floder.add(path_p=path)
    csv = Csv_base()
    csv.write_csv_file_line(
        file_path=path + "/data.csv", mode='w+',
        str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间'])
    # crawler: www.cankaoxiaoxi.com military channel, pages 1-100
    start_url = "http://www.cankaoxiaoxi.com/mil/gjjq/%d.shtml"
    for i in range(1, 101):
        url = start_url % (i)
        #print(url)
        list_html = htmlSource.get_html(url_p=url, type_p='rg')
        #print(list_html)
        colum = [('a',
                  '//div[@class="inner"]//ul[@class="txt-list-a fz-14"]//li//a//@href',
                  'sab', '')]
        list = rule.html_content_analysis_detial(html_text=list_html, column=colum, url=url)
        #print(list)
        for a in list[0][1]:
            read_detial(a, i)
def runDetail(self, file_path, confs, listconfs):
    csv = Csv_base()
    # read the list CSV, then crawl every detail page it references
    for listconf in listconfs:
        dic_list = []
        for row in listconf['columns']:
            dic_list.append(row['名称'])
        listList = csv.read_csv_file_dict(file_path=file_path, fieldnames=dic_list)
        for row in listList:
            self.crawlerDetail(confs, url=row[confs['urlname']])
def runDict(self, url, conf):
    rule = Rule()
    result = rule.crawler_list(url, conf)
    # TODO: persist to a database
    dic_list = []
    for row in conf['columns']:
        dic_list.append(row['名称'])
    # write the result rows to a CSV file
    csv = Csv_base()
    csv.write_csv_file_dictLines(file_path='../data/xuexi111.csv', strs=result, fieldnames=dic_list)
def runList(self, file_path, confs, dictconf):
    csv = Csv_base()
    # read the previously collected entries
    dic_list = []
    for row in dictconf['columns']:
        dic_list.append(row['名称'])
    dictList = csv.read_csv_file_dict(file_path=file_path, fieldnames=dic_list)
    # crawl and write the next-level data for every entry
    for dict in dictList:
        for conf in confs:
            self.crawlerNext(conf, url=dict[conf['urlname']])
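
# The conf/listconf dictionaries above are indexed by 'columns' (a list of dicts with
# a '名称' key) and by 'urlname'. A purely hypothetical example of their shape, inferred
# only from this code; the real configuration is defined elsewhere in the repo:
example_dictconf = {
    'columns': [{'名称': 'title'}, {'名称': 'url'}],   # CSV field names
}
example_confs = [{
    'urlname': 'url',                                  # which CSV field holds the page URL
    'columns': [{'名称': 'title'}, {'名称': 'href'}],
}]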
def main():
    floder = File_floder()
    floder.add(path_p=path)
    csv = Csv_base()
    csv.write_csv_file_line(
        file_path=path + "/data.csv", mode='w+',
        str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间'])
    # crawler: news.cn node-article list API, pages 1-5
    start_url = "http://qc.wa.news.cn/nodeart/list?nid=11139636&pgnum=%d&cnt=1000&tp=1&orderby=1"
    for i in range(1, 6):
        url = start_url % (i)
        read_detial(url, i)
def crawlerNext(self, conf, url=''):
    rule = Rule()
    csv = Csv_base()
    list_list = []
    for row in conf['columns']:
        list_list.append(row['名称'])
    result, next_page = rule.crawler_list(url, conf)
    print(result)
    if (len(result) > 0):
        csv.write_csv_file_dictLines(file_path='../data/xuexi111List.csv',
                                     strs=result, fieldnames=list_list)
    # follow the "next page" link recursively until there is none
    if (next_page):
        self.crawlerNext(conf, url=next_page)
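
# crawlerNext recurses once per "next page" link, so a very long listing could in
# principle hit Python's recursion limit. A minimal iterative sketch of the same loop,
# assuming rule.crawler_list keeps returning (rows, next_page_url_or_None):
def crawlerNextIterative(self, conf, url=''):
    rule = Rule()
    csv = Csv_base()
    list_list = [row['名称'] for row in conf['columns']]
    while url:
        result, url = rule.crawler_list(url, conf)
        if result:
            csv.write_csv_file_dictLines(file_path='../data/xuexi111List.csv',
                                         strs=result, fieldnames=list_list)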
def main():
    floder = File_floder()
    floder.add(path_p=path)
    csv = Csv_base()
    csv.write_csv_file_line(
        file_path=path + "/data.csv", mode='w+',
        str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间'])
    # crawler: thepaper.cn load_index API, pages 1-25
    start_url = "https://www.thepaper.cn/load_index.jsp?nodeids=25430&topCids=&pageidx=%d&isList=true&lastTime=1550211962471"
    for i in range(1, 26):
        url = start_url % (i)
        #print(url)
        list_html = htmlSource.get_html(url_p=url, type_p='rg')
        #print(list_html)
        colum = [('a', '//div[@class="news_li"]//h2//a//@href', 'sab', 'https://www.thepaper.cn/')]
        list = rule.html_content_analysis_detial(html_text=list_html, column=colum, url=url)
        #print(list)
        for a in list[0][1]:
            read_detial(a, i)
def main():
    floder = File_floder()
    floder.add(path_p=path)
    csv = Csv_base()
    csv.write_csv_file_line(
        file_path=path + "/data.csv", mode='w+',
        str=['网站名', '网址', '标题', '新闻内容', '发布时间', '采集时间'])
    # crawler: military.china.com news index (single page)
    start_url = "https://military.china.com/news/"
    list_html = htmlSource.get_html(url_p=start_url, type_p='rg')
    #print(list_html)
    colum = [('a', '//div[@class="column-list"]//h3[@class="tit"]//a//@href', 'l')]
    list = rule.html_content_analysis_detial(html_text=list_html, column=colum, url=start_url)
    #print(list)
    for a in list[0][1]:
        read_detial(a)
def main():
    floder = File_floder()
    floder.add(path_p=path)
    csv = Csv_base()
    csv.write_csv_file_line(
        file_path=path + "/data.csv", mode='w+',
        str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间'])
    # crawler: www.dsti.net arms information list, pages 1-814
    start_url = "http://www.dsti.net/Information/HyeList/arms/%d"
    for i in range(1, 815):
        url = start_url % (i)
        #print(url)
        list_html = htmlSource.get_html(url_p=url, type_p='rg')
        #print(list_html)
        colum = [('a', '//div[@class="listMidContent"]//ul//li//a//@href', 'sab', 'http://www.dsti.net')]
        list = rule.html_content_analysis_detial(html_text=list_html, column=colum, url=url)
        #print(list)
        for a in list[0][1]:
            read_detial(a, i)
def main():
    floder = File_floder()
    floder.add(path_p=path)
    csv = Csv_base()
    csv.write_csv_file_line(
        file_path=path + "/data.csv", mode='w+',
        str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间'])
    # crawler: mil.news.sina.com.cn rolling news, pages 1-25
    start_url = "http://mil.news.sina.com.cn/roll/index.d.html?cid=57919&page=%d"
    for i in range(1, 26):
        url = start_url % (i)
        #print(url)
        list_html = htmlSource.get_html(url_p=url, type_p='rg')
        #print(list_html)
        colum = [('a', '//div[@class="fixList"]//ul//li//a//@href', 'l')]
        list = rule.html_content_analysis_detial(html_text=list_html, column=colum, url=url)
        #print(list)
        for a in list[0][1]:
            # only follow article links (they end in ".shtml")
            if a.endswith('.shtml'):
                read_detial(a, i)
def main():
    floder = File_floder()
    floder.add(path_p=path)
    csv = Csv_base()
    csv.write_csv_file_line(
        file_path=path + "/data.csv", mode='w+',
        str=['网站名', '网址', '标题', '新闻内容', '发布时间', '页码', '采集时间'])
    # crawler: military.people.com.cn column index, pages 1-7
    start_url = "http://military.people.com.cn/GB/1077/index%d.html"
    for i in range(1, 8):
        url = start_url % (i)
        #print(url)
        list_html = htmlSource.get_html(url_p=url, type_p='rg')
        #print(list_html)
        colum = [('a', '//div[@class="ej_list_box clear"]//ul//li//a//@href', 'sab',
                  'http://military.people.com.cn')]
        list = rule.html_content_analysis_detial(html_text=list_html, column=colum, url=url)
        #print(list)
        for a in list[0][1]:
            read_detial(a, i)
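
# Each main() above belongs to a separate crawler script that defines htmlSource, rule
# and path at module level (see the script header further below). Presumably each one
# is launched with the standard entry-point guard:
if __name__ == '__main__':
    main()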
#!/usr/bin/env python
# coding=utf-8
from common.inc_csv import Csv_base
from common.inc_file import File_file
from lxml import html
import re

csv = Csv_base()
file = File_file()


def replaceStr(a):
    # strip Baidu Baike boilerplate (favourites counter, edit-lock notice, edit prompts)
    print(a)
    a = re.sub(re.compile(r"收藏查看我的收藏(\d+)有用(.*?)(\d+)已投票(\d+)", re.S), "", a)
    a = str(a).replace("编辑锁定", " ").strip()
    a = str(a).replace("讨论999", " ").strip()
    a = str(a).replace("本词条缺少概述图,补充相关内容使词条更完整,还能快速升级,赶紧来编辑吧!", " ").strip()
    a = str(a).replace(
        "百度百科内容由网友共同编辑,如您发现自己的词条内容不准确或不完善,欢迎使用本人词条编辑服务(免费)参与修正。", " ").strip()
    a = str(a).replace("立即前往 >>", " ").strip()
    print(a)
    return a


if __name__ == '__main__':
    csv_data_path = "../../data/百科候选关键词.csv"
    rows = csv.read_csv_file(csv_data_path)
    for row in rows:
        try:
            pass  # per-row processing not shown here
        except Exception as e:
            print(e)
# -*- coding: UTF-8 -*-
from common.inc_csv import Csv_base
from common.inc_file import File_floder
import requests

if __name__ == '__main__':
    file_path = '../data/百科候选关键词.csv'
    folder_path = "../data/百科候选关键词/img"
    floder = File_floder()
    floder.add(folder_path)
    file = Csv_base()
    list = file.read_csv_file(file_path)
    for row in list:
        try:
            # row[4] is expected to hold the image url, row[11] the keyword text;
            # backticks are quoting noise left over from the export
            img_url = str(row[4]).replace('`', '')
            if (img_url != ''):
                img_name = img_url.split("/")[-1]
                if (img_name.find("?") > -1):
                    img_name = img_name[0:img_name.find("?")]
                #img_content = requests.get(img_url).content
                #with open('../data/百科候选关键词/img/%s.jpg' % img_name, 'wb') as f:
                #    f.write(img_content)
                rw_str = [img_url, img_name, row[11].replace('`', '')]
                file.write_csv_file_line(file_path="../data/百科候选关键词_img.csv", str=rw_str)
        except Exception as e:
            print(e)
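
# The actual download above is commented out. A minimal sketch of how it might be
# re-enabled with a timeout and basic error handling; the target folder and the .jpg
# extension follow the commented-out lines and may need adjusting:
def download_image(img_url, img_name, folder='../data/百科候选关键词/img'):
    try:
        resp = requests.get(img_url, timeout=10)
        resp.raise_for_status()
        with open('%s/%s.jpg' % (folder, img_name), 'wb') as f:
            f.write(resp.content)
    except requests.RequestException as e:
        print('download failed for %s: %s' % (img_url, e))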
# -*- coding: UTF-8 -*-
from common.HtmlSource import HtmlSource
from common.Rule import Rule
#from common.inc_conn import Conn_mysql
from common.inc_csv import Csv_base
from common.inc_file import File_file, File_floder
import requests
from lxml import html
import time

floder = File_floder()
htmlSource = HtmlSource()
rule = Rule()
csv = Csv_base()
flag = 0
commontitle = 1


# multi-threaded crawling
def read_detial(url, path):
    if (str(url[0][1]).startswith("http") and url[1][1] != "收起"):
        # TODO: pagination
        for i in range(1, 11):
            print(url[0][1])
            detial_html = htmlSource.get_html(url_p=url[0][1] + "/?page=%d" % i, type_p='rg')
            tree = html.fromstring(detial_html)
            hreflist = tree.xpath(
                '//ul[@class="list"]/li/div/div/p[@class="name"]/a/@href')
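
# The "multi-threaded crawling" comment above is not reflected in the sequential loop
# shown. A minimal sketch of how the per-url work could be parallelised with the
# standard library, assuming read_detial is safe to call from worker threads:
from concurrent.futures import ThreadPoolExecutor

def crawl_all(urls, path, workers=4):
    with ThreadPoolExecutor(max_workers=workers) as pool:
        for url in urls:
            pool.submit(read_detial, url, path)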
#!/usr/bin/env python
# coding=utf-8
from common.inc_csv import Csv_base
from lxml import html
import re


def readFile(filePath=''):
    pass


if __name__ == '__main__':
    filepath = '../data/问答语料_1.0.txt'
    file = Csv_base()
    list = file.read_csv_file(filepath)
    for i in range(len(list)):
        if (i > 3):
            row = list[i]
            rows = str(row[0]).split("\t")
            html_text = rows[1].replace("[", '').replace("]", "")
            # strip <script> blocks before parsing (re.S lets "." match newlines)
            html_text = re.sub(re.compile(r"<script.*?</script>", re.S), "", html_text)
            print(html_text)
            tree = html.fromstring(html_text)
            texts = tree.xpath('.//text()')
            text = ""
            for a in texts:
                text = text + str(a).replace("\\n", ".").strip()
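
# The xpath('.//text()') join above collects every text node individually. lxml also
# offers text_content(), which returns the concatenated text of an element in one call;
# a small equivalent sketch (whitespace handling differs slightly):
def extract_text(html_text):
    tree = html.fromstring(html_text)
    return tree.text_content().replace("\\n", ".").strip()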