def read_detial(url, i):
    """Fetch a detail page, archive the raw HTML, extract fields, and append a row to data.csv.

    url -- detail-page URL to crawl
    i   -- sequence number recorded in the CSV row
    """
    detial_html = htmlSource.get_html(url_p=url, type_p='rg')
    # Archive the raw HTML under the last path segment of the URL.
    files = File_file()
    file_name = url.split('/')[-1]
    files.save_source(path=path, file=file_name, all_the_text=detial_html, encoding_='utf-8')
    # Extraction rules: (field name, XPath expression, mode[, join separator]).
    colum = [
        ('title', '//h1[@class="articleHead"]//text()', 'l'),
        ('pushDate', '//div[@class="info"]//span[@class="infoA"][@id="pubtime_baidu"]//text()', 'l'),
        ('content', '//div[@class="articleText"]//text()', 'sarra', ','),
    ]
    result = rule.html_content_analysis_detial(html_text=detial_html, column=colum, url=url)
    print(result)
    # Append one CSV row: site name, URL, title, publish date, content,
    # sequence number, and the crawl timestamp.
    csv = Csv_base()
    csv.write_csv_file_line(
        file_path=path + "/data.csv",
        str=[
            '参考消息',
            url,
            result[0][1],
            result[1][1],
            result[2][1],
            i,
            time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())),
        ],
    )
def saveConf(self):
    """Persist the current crawler configuration to ./configs.

    TODO: upload to the server / store in a database to provide a
    monitoring query point; prompt for the config name, execution
    conditions, scheduling, etc., and record them in the database.
    """
    filename = "testxuexi111.json"
    writer = File_file()
    writer.save_source(path="./configs", file=filename, all_the_text=str(self.conf))
def crawlerDetail(self, confs, url=''):
    """Crawl a detail page according to *confs* and append the result to xuexi111Detail.json."""
    detail = Rule().crawler_detail(confs=confs, url=url)
    print(detail)
    # Append the result as one line of the data file.
    writer = File_file()
    writer.save_source(path='../data/', file='xuexi111Detail.json', all_the_text=str(detail) + '\n')
def read_detial(url, i):
    """Download the page at *url* and save the raw response as '<i>.json' under *path*."""
    page_html = htmlSource.get_html(url_p=url, type_p='rg')
    # File name is derived from the sequence number, not the URL.
    File_file().save_source(path=path, file="%d.json" % i, all_the_text=page_html, encoding_='utf-8')
# -- coding: UTF-8 --
"""Scrape Chinese solar-term (节气) tables per year and append them to jieqi.json."""
import json

import requests
from lxml import etree

from common.inc_file import File_file

file = File_file()
urlx = 'https://jieqi.supfree.net/cntv.asp?n='
session = requests.Session()

for year in range(833, 5001):
    dicall = {}
    url = urlx + str(year)
    # The site serves GB2312-encoded pages; set the encoding before reading .text.
    resp = session.get(url=url)
    resp.encoding = 'gb2312'
    rep = etree.HTML(resp.text)
    a = rep.xpath('//table/tr/td/a/text()')  # solar-term names
    b = rep.xpath('//table/tr/td/text()')    # matching date/time strings
    dicall[str(year)] = {}
    for i in range(len(a)):
        # Guard: the two node lists may differ in length on malformed pages.
        if len(b) > i:
            # Strip surrounding whitespace and drop the trailing 9 characters
            # (time-of-day suffix) from the date text.
            dicall[str(year)][a[i]] = str(b[i]).strip()[:-9]
    # BUG FIX: the output file is named .json, but str(dict) writes Python
    # repr (single quotes) which is not valid JSON. Emit one JSON object per
    # line instead, keeping non-ASCII characters readable.
    file.save_source(path="./", file="jieqi.json",
                     all_the_text=json.dumps(dicall, ensure_ascii=False) + "\n")