Example #2
class BaiduXueshu(object):
	host="http://xueshu.baidu.com"
	path="/s"
	url="http://xueshu.baidu.com/s"
	word="wd"
	citeurl="http://xueshu.baidu.com/u/citation"
	def __init__(self):
		self.request=None
		self.soup=None
		self.items=[]
		# newly added: check results and filter out bad ones
		self.pdfcheck=PDFdoiCheck()
	def reset(self):
		self.request=None
		self.soup=None
		del self.items[:]
		self.pdfcheck.reset('')
		
	def search(self,keyword,params=None,headers=None,proxy=None):
		self.reset()
		if (not keyword):return
		# Avoid mutable default arguments shared across calls
		if params is None: params={}
		if headers is None: headers={}
		params[self.word]=keyword
		params['sc_hit']='1' # fuzzy match: find all results, not exact
		if proxy:
			if not isinstance(proxy,dict):
				proxy=None
		try:
			if (proxy):
				r=requests.get(self.url,params=params,headers=headers,proxies=proxy,timeout=timeout_setting)
			else:
				r=requests.get(self.url,params=params,headers=headers,timeout=timeout_setting)
			if r.status_code == 200:
				if ('<img src="http://verify.baidu.com/cgi-bin/genimg' in r.text):
					# Captcha page: back off, retry, and stop parsing this response
					time.sleep(600)
					self.search(keyword,params=params,headers=headers)
					return
				try:
					self.soup=BeautifulSoup(r.text, "html.parser")
					self.items=self.soup.findChildren('div',attrs={'class':'result sc_default_result xpath-log'})
				except Exception as e:
					print e,'when parsing search results'
					return 
				#print "Find",len(self.items)," Results."
				#for item in items:
		except Exception as e:
			print e,"Error when searching word.."
			time.sleep(20)
			self.search(keyword=keyword,params=params,headers=headers,proxy=proxy)

	def _parsepdflink(self,link):
		'''Unwrap PDF links given in Baidu redirect format'''
		if (not link): return ""
		link=requests.utils.unquote(link)
		if (len(link)>4):
			if link[:2]=="/s":
				rer=re.search(r'(?<=url=)http.*?(?=&ie=utf-8)',link)
				if rer:
					link=rer.group()
					return link
			elif(link[:4] == 'http'):
				return link
			return ''
		return ""

	def getpdflink(self,num=0):
		pdfs=[ i.text for i in self.items[num].findChildren('p',attrs={'class':"saveurl"})] \
			+[ i['href'] for i in self.items[num].findChildren('a',attrs={'class':"sc_download c-icon-download-hover"})]
		pdfs=list(set([ adjustpdflink(self._parsepdflink(pdf)) for pdf in pdfs]))
		if '' in pdfs: pdfs.remove('')
		if (pdfs): print "Get",len(pdfs)," links for record ",num,":",#,str(pdfs)
		return pdfs

	def getcite(self,num=0,citetype="txt"):
		cite=self.items[num].findChild('a',attrs={'class':'sc_q c-icon-shape-hover'})
		try:
			params={'t':citetype,'url':cite['data-link'],'sign':cite['data-sign']}
			r=requests.get(self.citeurl,params=params,timeout=timeout_setting)
			if r.status_code == 200:
				return r.text
		except Exception:
			print "Can't get citation"
		return ""

	def getdoi(self,num=0):
		'''Get DOI from Baidu Cite'''
		soup=BeautifulSoup(self.getcite(num,citetype='txt'),"html.parser")
		if (soup.doi): 
			doi=soup.doi.text
		elif(soup.primarytitle):
			cr=CRrecord()
			cr.getfromtitle(soup.primarytitle.info.text,ignorecheminfo=True)
			doi=cr.doi
		else:
			doi=DOI("")
		return DOI(doi[doi.find('10.'):])
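	# Note: the citation payload is parsed leniently above: a <doi> tag is
	# used when present, otherwise the record title is resolved to a DOI via
	# CrossRef (inferred from the fields accessed, not documented upstream).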

	def getallpdf(self,doifilter=None,onlinecheck=True,savestate=None,usebdcheck=True):
		'''Get all PDFs from the search-result links.
		doifilter should be a callable returning True when the DOI is acceptable.'''
		usedoifilter=callable(doifilter)
		getallfilelist=[]
		if isinstance(savestate,(list,tuple,set)):
			savestate=set(savestate)
		elif (isinstance(savestate,int)):
			savestate=set([savestate])
		else:
			savestate=set([0,1,2,3])
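		# savestate now holds the pdfcheck result states worth keeping as
		# fallback candidates (state 0 is assumed to mean a verified "Done"
		# PDF and is handled separately below)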
		bdcheck=BDCheck()
		for i in range(len(self.items)):
			try:
				getfilelist=[]
				# Get PDF links
				links=self.getpdflink(i)
				if (links):
					doi=DOI(self.getdoi(i))
					if not doi:
						print "blank doi..",doi
						continue
					if ( usedoifilter and not doifilter(doi)):
						print doi,"doesn't pass filter.."
						continue
						
					# Check by bdcheck api
					if (usebdcheck):
						bdout=bdcheck.get(doi)
						if sum(bdout)>0:
							print doi, 'has search/oapdf/free',bdout
							continue
					oapdffree=bdcheck.setbycheck(doi)
					if (oapdffree[0] and oapdffree[1]):
						print doi,'exist in oapdf/free library..'
						continue						
					elif oapdffree[0]:
						print doi,'exist in oapdf library..'
						continue				
					elif oapdffree[1]:
						print doi,'exist in free library..'
						continue
					doifname=doi.quote()+".pdf"
					if (pdfexistpath(doifname)):
						print doi,'Files exist in current folder..'
						continue

					# Start to find pdf at each link
					print "### Find for result with DOI: "+doi
					foundDonePDF=False
					for link in links:
						print 'Link:',str(link),
						if (onlinecheck):
							print "Try Getting..",
							# Get a StringIO obj
							getpdfobj=getwebpdf(link,fname=doifname,params=getwebpdfparams(link),stringio=True)
							if (not getpdfobj):
								continue
							try:
								dpfresult=self.pdfcheck.checkonlinepdf(fobj=getpdfobj,doi=doi)
								sys.stdout.flush()
								if (dpfresult!=0):
									if ( savestate and (dpfresult in savestate)):
										#Important to set fname to None
										rmresult=self.pdfcheck.removegarbage(fname=None,notdelete=True)
										if (rmresult <= 1):
											getfilelist.append( (getpdfobj,self.pdfcheck.realdoi,dpfresult))
									else:
										print "Not OK PDF for doi",doi												
								else:
									foundDonePDF=True
									if (self.pdfcheck.savefobj2file(doi=self.pdfcheck.realdoi,state=0,fobj=getpdfobj)):
										print "!!!!!!! Get PDF file to Done!: "+self.pdfcheck.realdoi
										del getfilelist[:]	
										nowdoi=DOI(self.pdfcheck.realdoi)
										getallfilelist.append('Done/'+nowdoi.quote()+'.pdf')

										break
									else:
										print "What? should never happen for pdfdoicheck.savefobj2file Done.."
							except Exception as e:
								print e,'Error at baidu getallpdf(web) when doing pdfcheck',doi,link

						# Deprecated path: downloading the PDF to disk; prefer the online check above
						elif (getwebpdf(link,fname=doifname,params=getwebpdfparams(link))):
							print "Please don't use download pdf to disk, use check online!"
							print "Try Getting..",
							try:
								dpfresult=self.pdfcheck.renamecheck(doifname)
								sys.stdout.flush()
								if (dpfresult!=0): 
									if ( savestate and (dpfresult in savestate)):
										#Important to set fname to None		
										rmresult=self.pdfcheck.removegarbage(fname=None)
										if (rmresult <= 1):
											if (os.path.exists(self.pdfcheck._fname)):
												getfilelist.append((self.pdfcheck._fname, dpfresult))
											else:
												print "What? should never happen for pdfdoicheck.moveresult Not Done.."
										else:
											print "Has been removed.."
									else:
										if (os.path.exists(self.pdfcheck._fname)) : 
											os.remove(self.pdfcheck._fname)
								else:
									foundDonePDF=True
									if (os.path.exists(self.pdfcheck._fname)):
										print "!!!!!!! Get PDF file to Done!: "+doifname
										getfilelist.append(self.pdfcheck._fname)
										#time.sleep(random.randint(1,5))								
										break
									else:
										print "What? should never happen for pdfdoicheck.moveresult Done.."
							except Exception as e:
								if os.path.exists(doifname):
									if (not os.path.exists('tmpfail/'+doifname)):
										os.renames(doifname,'tmpfail/'+doifname)
									else:
										os.remove(doifname)
								print e,'Error at baidu getallpdf when doing pdfcheck'
						else:
							print "can't get at this link"

					bdcheck.set(doi)
					# Online check ran but no "Done" PDF was found:
					# keep the candidate with the smallest (best) check state
					if onlinecheck and not foundDonePDF and len(getfilelist)>0:
						minnum=-1
						minresult=999999
						for k in range(len(getfilelist)):
							if getfilelist[k][2]<minresult:
								minnum=k
								minresult=getfilelist[k][2]
						nowdoi=DOI(getfilelist[minnum][1])
						if (self.pdfcheck.savefobj2file(doi=nowdoi,state=getfilelist[minnum][2],fobj=getfilelist[minnum][0])):
							print "!!!!!!! Get PDF file to: "+self.pdfcheck.judgedirs.get(getfilelist[minnum][2],'.'),self.pdfcheck.realdoi
							getallfilelist.append(self.pdfcheck.judgedirs.get(getfilelist[minnum][2],'.')+os.sep+nowdoi.quote()+".pdf")
							del getfilelist[:]
			except Exception as e:
				print e, "##### Error when get pdf.."
		return getallfilelist

	def findwordPDF(self,keyword,doifilter=None):
		print "#########################################################################"
		print "## Now finding for: "+ keyword+"............"
		sys.stdout.flush()
		self.search(keyword=keyword)
		self.getallpdf(doifilter)		

	def findcrossreftitledoi(self,doi,printyn=True):
		'''Look up the title for the DOI via CrossRef, then search Baidu with title+DOI'''
		cr=CRrecord()
		if( cr.getfromdoi(doi,fullparse=False) and cr.doi):
			keyword=cr.title+" "+cr.doi
			print "#########################################################################"
			print "## Now finding for doi with title: "+ keyword.encode('utf-8')+"............"
			sys.stdout.flush()
			self.search(keyword=keyword)
			self.getallpdf()
		else:
			print "Error DOI!: "+doi
		cr.reset()

	def finddoiPDFfromFile(self,fname):
		'''Read DOIs (one per line) from a file and use them to find PDFs'''
		fin=open(fname)
		countN=0
		for line in fin:
			ldoi=line.lower().strip()
			doi=DOI(ldoi)
			if (os.path.exists(doi.quote()+".pdf")):
				continue
			self.findcrossreftitledoi(ldoi)
			#time.sleep(random.randint(1,10))
			countN+=1
			if countN>=10:
				gc.collect()
				countN=0
		fin.close()			

	def findPDFbyISSN(self,issn,maxresult=None, step=100, offset=0, 
		usedoi=True,doifilter=None,onlinecheck=True,savestate=None,proxy=None,usebdcheck=True):
		'''Find PDFs by ISSN, based on search results from CrossRef'''
		# TODO: maybe extend beyond ISSN to other query types
		if (not issn):return
		if (len(issn)==9 and issn[4]=='-'):
			needurl="http://api.crossref.org/journals/"+issn+"/works"
		elif('10.' in issn):
			needurl="http://api.crossref.org/prefixes/"+issn+"/works"
		else:
			print "Error ISSN/prefix"
			sys.exit(1)
		cr=CRrecord()
		total=cr.gettotalresultfromlink(needurl)
		if (not maxresult or maxresult <=0 or maxresult>total): 
			maxresult=total
		params={"rows":str(step)}
		maxround=(maxresult-offset)/step+1
		offsetcount=offset
		bdcheck=BDCheck()

		for i in range(maxround):
			params["offset"]=str(step*i+offset)
			r=requests.get(needurl,params,timeout=timeout_setting_download)
			if (r.status_code == 200):
				# Batch-fetch which DOIs are already checked or in the oapdf library
				if usebdcheck: 
					bdcheckall=bdcheck.filterdois(r.json(),oapdf=1,crjson=True)

				for j in r.json().get('message',{}).get('items',[]):
					keyword=j.get('title',[''])
					doi=DOI(j.get("DOI",""))
					if not doi:
						offsetcount+=1
						time.sleep(2)
						continue

					# Check whether in bdcheck
					if (usebdcheck and doi in bdcheckall):
						print doi, 'has search/oapdf/free by bdcheck'
						offsetcount+=1
						time.sleep(1)
						continue
						
					# If not in bdcheck, check oapdf/free and set it
					# TODO: remove it after combine oapdf information to library
					oapdffree=bdcheck.setbycheck(doi)
					if (oapdffree[0] or oapdffree[1]):
						print doi,'exist in oapdf/free library..'
						offsetcount+=1
						time.sleep(1)
						continue						

					if (keyword): 
						keyword=keyword[0]
					else:
						time.sleep(2)
						offsetcount+=1
						continue
					if usedoi:keyword+=" "+doi
					print "#####################################",offsetcount,"####################################"
					print "## Now finding for doi with title:"+doi+" "+ keyword.encode('utf-8')+"............"
					sys.stdout.flush()
					self.search(keyword.encode('utf-8'),proxy=proxy)
					bdresult=self.getallpdf(doifilter,onlinecheck=onlinecheck,savestate=savestate,usebdcheck=usebdcheck)
					bdcheck.set(doi)
					offsetcount+=1
			gc.collect()
		print "End of process for",issn