def getPapers(url,spath,i,geo,total,fcreate=False):
    """Scrape one issue page of www.progeophys.cn and hand the records on.

    Parameters:
        url     -- issue page URL to fetch
        spath   -- save directory; combined with the module-level `save_name`
        i       -- issue index, passed through to getPaper_geo
        geo     -- accumulator object threaded through chkData_geo/getPaper_geo
        total   -- running record count, threaded through getPaper_geo
        fcreate -- first-call flag for chkData_geo (default False)

    Returns (geo, fcreate, total) as updated by the helpers.
    NOTE(review): indentation reconstructed from a collapsed source line —
    confirm loop nesting against the original file.
    """
    html=getHtml(url,0)
    burl=[]
    # Download links are emitted via a javascript onclick handler; capture its
    # argument list with a regex (the trailing triple-quoted string closes the
    # pattern at the site URL that follows the captured group).
    reg='<a href="javascript:;" onclick="lsdy1'+'(.*?)'+''','http://www.progeophys.cn'''
    downno=re.findall(reg,html)
    downurl=[]
    for j in downno:
        # Each match is a comma-separated argument list; the second argument
        # (quotes stripped via [1:-1]) is the download id appended to down_url.
        tmp=j.split(',')
        tmp=tmp[1]
        downurl.append(down_url+tmp[1:-1])
    # Titles and authors come from regexes rather than xpath — see the
    # commented-out xpath lines below for why.
    reg='<a target="_blank".*?>'+'(.*?)'+'</a>'
    biaoti=re.findall(reg,html)
    reg='<div class="zuozhe">'+'(.*?)'+'</div>'
    zuozhe=re.findall(reg,html)
    html=etree.HTML(html)
    chuban=html.xpath('//div[@class="chuban"]/span/text()')
    # biaoti=html.xpath('//li/div[@class="biaoti"]/a/text()')  # titles may contain special characters
    # zuozhe=html.xpath('//li/div[@class="zuozhe"]/text()')  # authors may be empty
    jianjie=html.xpath('//li/div[@class="jianjie"]/text()')
    downcount=html.xpath('//li/div/span/b/text()')
    # Second <span> under div.chuban holds "label date"; keep the date token.
    tmp=chuban[1].split(' ')
    tmp=tmp[1]  # publication date
    geo, fcreate=chkData_geo(geo, fcreate)
    geo, total=getPaper_geo(geo, spath+'\\'+save_name, spath, burl, biaoti, total, i, tmp, zuozhe, jianjie, downurl, downcount)
    return geo, fcreate, total
def handle():
    """Render a student's record as a PDF and return it as a file download.

    Reads name/studentNum/organization from the posted form, looks the
    record up via getRecords, fills in the form fields, renders HTML with
    getHtml, converts it to <studentNum>.pdf via pdfkit, and sends the file
    back with an attachment Content-Disposition header. Returns the string
    'nothing' when no record is found (getRecords signals this with -1).
    """
    student_name = request.form.get('name')
    student_num = request.form.get('studentNum')
    org = request.form.get('organization')

    record = getRecords.getRecords(student_name)
    if record == -1:
        # Lookup failed — getRecords uses -1 as its not-found sentinel.
        return 'nothing'

    # Overlay the submitted form fields onto the fetched record.
    record['name'] = student_name
    record['studentNum'] = student_num
    record['organization'] = org

    rendered_html = getHtml.getHtml(record)
    out_file = str(student_num) + ".pdf"

    # wkhtmltopdf page geometry and encoding for the generated PDF.
    pdf_options = {
        'margin-top': '1.0in',
        'margin-right': '0.75in',
        'margin-bottom': '0.75in',
        'margin-left': '1.0in',
        'page-size': 'A4',
        'encoding': "UTF-8",
        'custom-header': [
            ('Accept-Encoding', 'gzip')
        ]
    }
    pdfkit.from_string(rendered_html, out_file, options=pdf_options)

    resp = make_response(send_file(out_file))
    resp.headers["Content-Disposition"] = "attachment; filename=" + out_file + ";"
    return resp
def main(self):
    """Assemble an HTML results report from per-case .yaml status files.

    Scans self.result_path for .yaml entries, builds one table row per
    non-log case (result from the yaml file, screenshot and log paths
    derived by substituting 'status'/'yaml' in the status path), then
    writes the report via getHtml.getHtml.
    """
    import getHtml

    status_files = self.get_status(self.result_path, '.yaml')
    rows = []
    for case_name, confirm_status in status_files.items():
        # Skip auxiliary log entries; only real cases become table rows.
        if case_name.startswith('log'):
            continue
        base_name = str(case_name).split('.')[0]
        outcome = self.open_yaml(confirm_status)
        # The screenshot/log paths mirror the status path with the
        # directory and extension swapped.
        screenshot = self.confirm_file(
            str(confirm_status).replace('status', 'img').replace('yaml', 'png'))
        log_file = self.confirm_file(
            str(confirm_status).replace('status', 'log').replace('yaml', 'log'))
        rows.append(getHtml.get_tr(base_name, outcome, screenshot, log_file))

    getHtml.getHtml(''.join(rows), self.status(), self.result_path)
def getPapers_geo(url):
    """Extract issue identifiers from the journal's table-of-contents page.

    Fetches `url`, scans every <td> cell, and for cells containing a link
    returns the substring between 'volumn_' and the '.shtml' suffix of the
    linked page name.
    """
    page = getHtml(url, 0)
    cells = BeautifulSoup(page, 'lxml').findAll(["td"])
    # Slice bounds: skip the 7-char 'volumn_' prefix, stop just before
    # the '.' preceding 'shtml'.
    return [
        markup[markup.find("volumn_") + 7:markup.find("shtml") - 1]
        for markup in (str(cell) for cell in cells)
        if "href" in markup
    ]
def getPapers_ogp(url):
    """Collect article ids from anchors of the 'J_WenZhang' article list.

    Fetches `url`, keeps only <a> tags whose markup mentions 'J_WenZhang'
    and whose link text is neither blank nor missing, and returns the id
    substring between 'volumn_' and '.shtml' for each.
    """
    page = getHtml(url, 0)
    soup = BeautifulSoup(page, 'lxml')
    papers = []
    for anchor in soup.findAll(["a"]):
        markup = str(anchor)
        if "J_WenZhang" not in markup:
            continue
        # Re-parse the single anchor to read its text content.
        label = str(BeautifulSoup(markup, 'lxml').a.string)
        # Drop anchors with whitespace-only or absent text ('None' is the
        # str() of a missing .string).
        if label.isspace() or label == "None":
            continue
        start = markup.find("volumn_") + len('volumn_')
        papers.append(markup[start:markup.find(".shtml")])
    return papers
def getURL():
    """Collect document titles and downloadable file URLs from the page.

    Returns:
        (title, file): `title` holds each link's data-title except a fixed
        set of section headings; `file` holds each data-url that is not a
        protocol-relative '//office' preview link.

    NOTE(review): reconstructed from a collapsed source line — the data-url
    filter is assumed to apply to every link, not only titled ones; confirm
    against the original file.
    """
    result = getHtml()
    # Fix: name the parser explicitly. Bare BeautifulSoup(result) emits a
    # GuessedAtParserWarning and can pick different parsers (and therefore
    # different parse trees) depending on what is installed; 'lxml' matches
    # the parser used by the other scrapers in this file.
    soup = BeautifulSoup(result, 'lxml')
    file = []
    title = []
    # Section headings that must not be treated as document titles.
    skip_titles = ['教材内容', '评价考核', '教学设计', '重点难点']
    for link in soup.find_all('a'):
        target = link.get('data-url')
        gettitle = link.get('data-title')
        if gettitle and str(gettitle) not in skip_titles:
            title.append(gettitle)
        # Keep real files; skip '//office' online-preview URLs.
        if target and not re.findall(r'//office', str(target)):
            file.append(target)
    return title, file
# -*- coding:utf-8 -*- from getHtml import getHtml import re import xlwt import chardet # 01.获取51job网页内容 url = "https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=4&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=" html = getHtml(url) code = chardet.detect(html)['encoding'] html = html.decode(code).encode('utf-8') # 02.设置正则 reg = re.compile( r'class="t1 ">.*? <a target="_blank" title="(.*?)".*? <span class="t2"><a target="_blank" title="(.*?)".*?<span class="t3">(.*?)</span>.*?<span class="t4">(.*?)</span>.*? <span class="t5">(.*?)</span>', re.S) #03. 获取结果 result = re.findall(reg, html) print len(result) # 04. 打开workbook book = xlwt.Workbook(encoding='utf-8') #05.创建工作表 sheet = book.add_sheet('python职位') # 06.存入第一行 col = ('职位名', '公司名', '工作地点', '薪资', '发布时间') for i in range(len(col)): sheet.write(0, i, col[i]) for i in range(len(result)): for j in range(len(result[i])):
def getBS(url, id=None):
    """Fetch `url` (optionally scoped by `id`) and parse it with BeautifulSoup.

    Thin convenience wrapper: getHtml does the fetching, and the markup is
    parsed with the stdlib 'html.parser' backend. (`id` keeps its original
    name for callers even though it shadows the builtin.)
    """
    markup = getHtml(url, id)
    return BeautifulSoup(markup, features="html.parser")
def getPapers(url, spath, i, geo, total, fcreate=False):
    """Scrape one journal issue page and forward the records to getPaper.

    Parameters:
        url     -- issue page URL to fetch
        spath   -- save directory; combined with the module-level `save_name`
        i       -- issue index, passed through to getPaper
        geo     -- accumulator threaded through chkData/getPaper
        total   -- running record count, threaded through getPaper
        fcreate -- first-call flag for chkData (default False)

    Returns (geo, fcreate, total) as updated by the helpers.
    NOTE(review): the parsing below is driven by `flag` counters whose
    behavior depends on the exact page layout; indentation reconstructed
    from a collapsed source line — confirm nesting against the original.
    """
    html = getHtml(url, 0)
    soup = BeautifulSoup(html, 'lxml')
    tmp = soup.findAll(["a"])
    ref = []
    # biaoti=[]
    # zuozhe=[]
    downurl = []
    downurlP = []
    burl = []
    flag = 0
    for ii in tmp:  # references and download links from the anchors
        j = str(ii)
        if j.find("onclick") >= 0:
            # Citation text lives in the onclick('...') argument.
            ref.append(j[j.find("('") + 2:j.find("')") - 4] + ';')
            # biaoti.append(j[j.find(".")+2:j.find("[J]")])
            # zuozhe.append(j[j.find("('")+2:j.find(".")])
        if j.find("attachType=PDF") >= 0:
            # Direct PDF attachment link ('amp;' entities stripped).
            downurlP.append(down_url + j[j.find("href=") + 8:j.find("<u>PDF") - 2].replace('amp;', ''))
        if j.find("J_VM") >= 0:
            # Every second J_VM anchor carries the HTML full-text link;
            # rewrite it to the PDF variant.
            flag = flag + 1
            if flag == 2:
                if j.find("attachType=HTML") < 0:
                    # No HTML attachment: remember this index in burl.
                    burl.append(len(downurl))
                downurl.append(down_url + (
                    j[j.find("href=") + 8:j.find("<u>HTML") - 2].replace('amp;', '').replace('HTML', 'PDF')))
                # else:
                #     downurlP.index(biaoti)
                flag = 0
    tmp = soup.findAll(["b"])
    biaoti = []
    for ii in tmp:  # titles
        j = str(ii)
        biaoti.append(j[j.find("<b>") + 3:j.find("</b>")])
    if ('' in biaoti):  # empty title: recover it from the second <p> block
        tmp = soup.findAll(["p"])
        # for ii in tmp:
        #     j=str(ii)
        j = str(tmp[1])
        biaoti[biaoti.index('')] = j[j.find("Verdana") + 9:j.find("</font>")]
    tmp = soup.findAll(["font"])
    down_htm = []
    down_pdf = []
    flag = 0
    for ii in tmp:  # download counts: digits alternate HTML-count, PDF-count
        j = str(ii.string).strip()
        if j.isdigit():
            flag = flag + 1
            if flag == 1:
                down_htm.append(j)
            elif flag == 2:
                down_pdf.append(j)
                flag = 0
    tmp = soup.findAll(["td"])
    zuozhe = []
    jianjie = []
    flag = 0
    for ii in tmp:  # authors/abstracts: cells follow a digit in page order
        soup1 = BeautifulSoup(str(ii), 'lxml')
        j = str(soup1.td.string)
        if j != "None":
            if j.isdigit() or flag > 0:
                flag = flag + 1
                if j.isdigit() and flag == 3:  # fetch failed: abstract missing
                    flag = 1
                    jianjie.append('')
                elif j.isdigit() and flag == 2:  # fetch failed: author+abstract missing
                    flag = 1
                    jianjie.append('')
                    zuozhe.append('')
                elif flag == 3:
                    flag = 0
                    jianjie.append(j)
                elif flag == 2:
                    zuozhe.append(j)
    # Pad abstracts so every title has an (possibly empty) abstract entry.
    for ii in range(len(jianjie), len(biaoti)):
        jianjie.append('')
    tmp = soup.findAll(["strong"])
    tmp = str(tmp[0]).split(':')[1][0:-9]  # publication date
    geo, fcreate = chkData(geo, fcreate)
    geo, total = getPaper(geo, spath + '\\' + save_name,
                          spath, burl, biaoti, total, i, tmp, zuozhe, jianjie, downurl, down_htm, down_pdf, ref)
    return geo, fcreate, total