Example #1
def getPapers(url,spath,i,geo,total,fcreate=False):
    html=getHtml(url,0)
    
    burl=[]

    reg='<a href="javascript:;" onclick="lsdy1'+'(.*?)'+",'http://www.progeophys.cn"
    downno=re.findall(reg,html)
    downurl=[]
    for j in downno:
        tmp=j.split(',')
        tmp=tmp[1]
        downurl.append(down_url+tmp[1:-1])
    reg='<a target="_blank".*?>'+'(.*?)'+'</a>'
    biaoti=re.findall(reg,html)
    reg='<div class="zuozhe">'+'(.*?)'+'</div>'
    zuozhe=re.findall(reg,html)
    
    html=etree.HTML(html)
    chuban=html.xpath('//div[@class="chuban"]/span/text()')
#    biaoti=html.xpath('//li/div[@class="biaoti"]/a/text()') # skipped: titles may contain special characters
#    zuozhe=html.xpath('//li/div[@class="zuozhe"]/text()') # skipped: author fields may be empty
    jianjie=html.xpath('//li/div[@class="jianjie"]/text()')
    downcount=html.xpath('//li/div/span/b/text()')
    
    tmp=chuban[1].split(' ')
    tmp=tmp[1] # publication date
    
    geo, fcreate=chkData_geo(geo, fcreate)  
    geo, total=getPaper_geo(geo, spath+'\\'+save_name, spath, burl, biaoti, total, i, tmp, zuozhe, jianjie, downurl, downcount)
    
    return geo, fcreate, total
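Every example on this page calls a project-level getHtml helper that is not shown. A minimal sketch of what such a helper might look like, assuming a plain urllib fetch; the second argument in getHtml(url, 0) is treated here as a retry/flag parameter, which is an assumption, and some projects (e.g. example #7 below) clearly expect raw bytes instead of decoded text.

# Hypothetical sketch of the getHtml(url, tries) helper assumed by these examples.
import urllib.request

def getHtml(url, tries=0):
    # Fetch the page with a browser-like User-Agent and return the decoded HTML text;
    # the real project helpers may add retries, cookies or encoding detection.
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    with urllib.request.urlopen(req, timeout=30) as resp:
        return resp.read().decode('utf-8', errors='ignore')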
Example #2
def handle():	
	name=request.form.get('name')
	studentNum=request.form.get('studentNum')		
	organization=request.form.get('organization')
	package=getRecords.getRecords(name)	
	if(package==-1):
		return 'nothing'
	# print package	
	package['name']=name
	package['studentNum']=studentNum
	package['organization']=organization
	result=getHtml.getHtml(package)
	outFile=str(studentNum)+".pdf"
	# with open(outFile,'w'):
	# print outFile
	options = {
        'margin-top': '1.0in',
        'margin-right': '0.75in',
        'margin-bottom': '0.75in',
        'margin-left': '1.0in',
        'page-size': 'A4',
        'encoding': "UTF-8",
        'custom-header': [
            ('Accept-Encoding', 'gzip')
        ]
    }
	pdfkit.from_string(result,outFile,options=options)
	response = make_response(send_file(outFile))
	response.headers["Content-Disposition"] = "attachment; filename="+outFile+";"
	return response
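The handle view above is only a fragment of a Flask application; the imports and route registration it relies on are not shown. A sketch of the surrounding scaffolding, assuming a POST endpoint (the route path and module names are assumptions inferred from the calls above):

# Hypothetical scaffolding around handle(); only the view body appears in the original snippet.
from flask import Flask, request, make_response, send_file
import pdfkit
import getRecords   # project module assumed to look up a record by name
import getHtml      # project module assumed to render the record dict to an HTML string

app = Flask(__name__)
# Register the view defined above on an assumed route:
app.add_url_rule('/handle', view_func=handle, methods=['POST'])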
Example #3
 def main(self):
     import getHtml
     result = self.get_status(self.result_path,'.yaml')
     lst = []
     for case_name, confirm_status in result.items():
         if not case_name.startswith('log'):
             case_name = str(case_name).split('.')[0]
             case_result = self.open_yaml(confirm_status)
             case_img = self.confirm_file(str(confirm_status).replace('status','img').replace('yaml','png'))
             case_log = self.confirm_file(str(confirm_status).replace('status', 'log').replace('yaml', 'log'))
             lst.append(
                 getHtml.get_tr(case_name,case_result,case_img,case_log)
             )
     getHtml.getHtml(''.join(lst),
                     self.status(),
                     self.result_path
                     )
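Example #3 calls getHtml.get_tr and getHtml.getHtml from a project module that is not shown. A rough sketch of what that module might contain, assuming get_tr builds one HTML table row and getHtml writes the assembled report; the signatures are inferred from the call sites above:

# Hypothetical contents of the getHtml report module used in example #3.
def get_tr(case_name, case_result, case_img, case_log):
    # One HTML table row per test case.
    return ('<tr><td>%s</td><td>%s</td><td><a href="%s">img</a></td>'
            '<td><a href="%s">log</a></td></tr>'
            % (case_name, case_result, case_img, case_log))

def getHtml(rows, status, result_path):
    # Wrap the rows in a page and write the report next to the results.
    page = '<html><body><p>%s</p><table>%s</table></body></html>' % (status, rows)
    with open(result_path + '/report.html', 'w', encoding='utf-8') as f:
        f.write(page)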
Example #4
def getPapers_geo(url):
    html=getHtml(url,0)
    soup=BeautifulSoup(html,'lxml')
    
    tmp=soup.findAll(["td"])
    papers=[]
    for ii in tmp:
        j=str(ii)
        if j.find("href")>=0:
            papers.append(j[j.find("volumn_")+7:j.find("shtml")-1])
    
    return papers
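getPapers_geo returns the IDs it finds between "volumn_" and ".shtml" in each link of the page. A short usage sketch; the listing URL below is a placeholder, not taken from the original project:

# Usage sketch for getPapers_geo; the URL is a placeholder.
papers = getPapers_geo('http://www.example.com/CN/volumn/volumn_list.shtml')
for paper_id in papers:
    print(paper_id)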
Example #5
def getPapers_ogp(url):
    html=getHtml(url,0)
    soup=BeautifulSoup(html,'lxml')
    
    tmp=soup.findAll(["a"])
    papers=[]
    for ii in tmp:
        j=str(ii)
        if j.find("J_WenZhang")>=0:
            soup1=BeautifulSoup(j,'lxml')
            jj=str(soup1.a.string)
            if (not jj.isspace()) and (jj!="None"):
                papers.append(j[j.find("volumn_")+len('volumn_'):j.find(".shtml")])
    
    return papers
Example #6
def getURL():
    result = getHtml()
    soup = BeautifulSoup(result, 'html.parser')  # parser specified explicitly to avoid the bs4 default-parser warning

    file = []
    title = []
    for link in soup.find_all('a'):
        target = link.get('data-url')
        gettitle = link.get('data-title')
        if gettitle:
            if (str(gettitle) not in ['教材内容', '评价考核', '教学设计', '重点难点']):
                title.append(gettitle)
        a = re.findall(r'//office', str(target))
        if not a:
            if target:
                file.append(target)
    return title, file
Example #7
# -*- coding:utf-8 -*-
from getHtml import getHtml
import re
import xlwt
import chardet
# 01. Fetch the 51job search-results page
url = "https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=4&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="
html = getHtml(url)
code = chardet.detect(html)['encoding']

html = html.decode(code).encode('utf-8')

# 02. Build the regex (captures job title, company, location, salary and post date)
reg = re.compile(
    r'class="t1 ">.*? <a target="_blank" title="(.*?)".*? <span class="t2"><a target="_blank" title="(.*?)".*?<span class="t3">(.*?)</span>.*?<span class="t4">(.*?)</span>.*? <span class="t5">(.*?)</span>',
    re.S)

# 03. Extract the matches
result = re.findall(reg, html)
print len(result)

# 04. Open the workbook
book = xlwt.Workbook(encoding='utf-8')
# 05. Create the worksheet (named 'python职位', i.e. "python jobs")
sheet = book.add_sheet('python职位')
# 06. Write the header row: job title, company, location, salary, post date
col = ('职位名', '公司名', '工作地点', '薪资', '发布时间')
for i in range(len(col)):
    sheet.write(0, i, col[i])
for i in range(len(result)):
    for j in range(len(result[i])):
        sheet.write(i + 1, j, result[i][j])  # write each captured field below the header row
# 07. Save the workbook (the original snippet is truncated here; the file name is an assumption)
book.save('51job_python.xls')
Example #8
def getBS(url, id=None):
    # create the BeautifulSoup object
    soup = BeautifulSoup(getHtml(url, id), features="html.parser")
    return soup
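getBS simply wraps the fetched page in a BeautifulSoup object. A usage sketch; the URL is a placeholder:

# Usage sketch for getBS; the URL is a placeholder.
soup = getBS('http://www.example.com/index.html')
for link in soup.find_all('a'):
    print(link.get('href'))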
Example #9
File: ogp.py  Project: lls-git/python
def getPapers(url, spath, i, geo, total, fcreate=False):
    html = getHtml(url, 0)

    soup = BeautifulSoup(html, 'lxml')

    tmp = soup.findAll(["a"])
    ref = []
    #    biaoti=[]
    #    zuozhe=[]
    downurl = []
    downurlP = []
    burl = []
    flag = 0
    for ii in tmp:  # references
        j = str(ii)
        if j.find("onclick") >= 0:
            ref.append(j[j.find("('") + 2:j.find("')") - 4] + ';')


#            biaoti.append(j[j.find(".")+2:j.find("[J]")])
#            zuozhe.append(j[j.find("('")+2:j.find(".")])

        if j.find("attachType=PDF") >= 0:
            downurlP.append(down_url + j[j.find("href=") + 8:j.find("<u>PDF") -
                                         2].replace('amp;', ''))
        if j.find("J_VM") >= 0:
            flag = flag + 1
            if flag == 2:
                if j.find("attachType=HTML") < 0:
                    burl.append(len(downurl))
                downurl.append(down_url + (
                    j[j.find("href=") + 8:j.find("<u>HTML") -
                      2].replace('amp;', '').replace('HTML', 'PDF')))
                #                else:
                #                    downurlP.index(biaoti)
                flag = 0

    tmp = soup.findAll(["b"])
    biaoti = []
    for ii in tmp:  # titles
        j = str(ii)
        biaoti.append(j[j.find("<b>") + 3:j.find("</b>")])
    if ('' in biaoti):  # empty title
        tmp = soup.findAll(["p"])
        #        for ii in tmp:
        #            j=str(ii)
        j = str(tmp[1])
        biaoti[biaoti.index('')] = j[j.find("Verdana") + 9:j.find("</font>")]

    tmp = soup.findAll(["font"])
    down_htm = []
    down_pdf = []
    flag = 0
    for ii in tmp:  # download counts
        j = str(ii.string).strip()
        if j.isdigit():
            flag = flag + 1
            if flag == 1:
                down_htm.append(j)
            elif flag == 2:
                down_pdf.append(j)
                flag = 0

    tmp = soup.findAll(["td"])
    zuozhe = []
    jianjie = []
    flag = 0
    for ii in tmp:  # abstracts
        soup1 = BeautifulSoup(str(ii), 'lxml')
        j = str(soup1.td.string)
        if j != "None":
            if j.isdigit() or flag > 0:
                flag = flag + 1
                if j.isdigit() and flag == 3:  # extraction failed
                    flag = 1
                    jianjie.append('')
                elif j.isdigit() and flag == 2:  # extraction failed
                    flag = 1
                    jianjie.append('')
                    zuozhe.append('')
                elif flag == 3:
                    flag = 0
                    jianjie.append(j)
                elif flag == 2:
                    zuozhe.append(j)
    for ii in range(len(jianjie), len(biaoti)):
        jianjie.append('')

    tmp = soup.findAll(["strong"])
    tmp = str(tmp[0]).split(':')[1][0:-9]  # publication date

    geo, fcreate = chkData(geo, fcreate)
    geo, total = getPaper(geo, spath + '\\' + save_name, spath, burl, biaoti,
                          total, i, tmp, zuozhe, jianjie, downurl, down_htm,
                          down_pdf, ref)

    return geo, fcreate, total
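In ogp.py this getPapers variant would be driven by a caller that walks an issue list and threads geo, total and fcreate through each call. A hedged call sketch; the issue URLs, save path and loop are assumptions, not code from the project:

# Hypothetical driver loop for getPapers in ogp.py; all values below are placeholders.
geo, total, fcreate = None, 0, False
for i, issue_url in enumerate(issue_urls):   # issue_urls assumed to come from an issue-list scrape
    geo, fcreate, total = getPapers(issue_url, r'D:\papers', i, geo, total, fcreate)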