def getdoi(self,num=0):
    '''Get DOI from Baidu Cite

    Parses the "txt" cite of item `num` (obtained from self.getcite).
    The <doi> element is used when present; otherwise, when a
    <primarytitle> element exists, the record is looked up by that title
    with CRrecord.getfromtitle and its doi is taken.

    Returns a DOI object cut to start at the first "10."; a blank DOI is
    returned when no text containing "10." is available.'''
    soup=BeautifulSoup(self.getcite(num,citetype='txt'),"html.parser")
    if (soup.doi):
        doi=soup.doi.text
    elif(soup.primarytitle):
        # No <doi> tag in the cite: fall back to a title search.
        cr=CRrecord()
        cr.getfromtitle(soup.primarytitle.info.text,ignorecheminfo=True)
        doi=cr.doi
    else:
        return DOI("")
    if not doi:
        return DOI("")
    # str.find gives -1 when "10." is missing; slicing from -1 would keep
    # the last character of the text and hand it back as a DOI.
    start=doi.find('10.')
    if start<0:
        return DOI("")
    return DOI(doi[start:])
def recursivedoicheck(self,excludedoi,olddoi,wtitle=0.65,cutoff=0.85,justcheck=False): tryjudge=4 trydoi="" rightdoi=[] excludedoi.add(olddoi) for doi in self.doi-excludedoi: print "Recursive check doi..",self._fname,doi, judgenum = self.renamecheck(self._fname,wtitle=wtitle,cutoff=cutoff,\ justcheck=True,resetfile=False,excludedoi=excludedoi,fdoi=doi) excludedoi.add(doi) if (judgenum is 0): rightdoi.append(doi) tryjudge=0 elif (judgenum<tryjudge): trydoi=doi tryjudge=judgenum # else, retain 4 and blank doi if (len(rightdoi) is 1): doi=DOI(rightdoi[0]) self.realdoi=doi if not justcheck: self.moveresult(0,printstr=None,newfname=doi.quote()+".pdf") return 0 elif (len(rightdoi) >= 2 ): if not justcheck: self.moveresult(3,printstr="Many DOIs are OK, can't distinguish...(Unsure)") return 3 # Unsure else: print "Doesn't have reliable doi", self._fname if not justcheck: self.moveresult(tryjudge,printstr=None) return tryjudge
def savefobj2file(self,fname="",doi="",state=None,fobj=None):
    '''Save the current file obj(file/StringIO) to a file
    And also set the self._fname

    fname: target file name; when empty it is built from doi as
        DOI(doi).quote()+".pdf".
    doi: only used to build the name when fname is empty.
    state: when not None, the file goes into self.judgedirs.get(state,'.')
        (the directory is created if missing).
    fobj: object to save; defaults to self.fobj.

    Returns True when the file was written, False when the target already
    exists, None when no name/doi was given or there is no open file
    object to save.'''
    if (not fname and not doi):
        print("File name or doi is not given!")
        return
    if (doi and not fname):
        doi=DOI(doi)
        fname=doi.quote()+'.pdf'
    if (state is not None):
        outdir=self.judgedirs.get(state,'.')
        if not os.path.exists(outdir):
            os.makedirs(outdir)
        fname=outdir+os.sep+fname
    if not fobj:
        fobj=self.fobj
    if (fname and fobj and not fobj.closed):
        fobj.seek(0)
        if (os.path.exists(fname)):
            # Never overwrite an existing file.
            print("File has exist....")
            return False
        # "with" closes the output even when read()/write() raises.
        with open(fname,'wb') as fout:
            fout.write(fobj.read())
        # Rewind so the caller can read the object again.
        fobj.seek(0)
        self._fname=fname
        return True
def setbycheck(self, doi): '''Update the bdcheck/oapdf/free in library based on check oapdf/free Can't set the record to "bdcheck" state return the [oapdf,free]''' try: if (isinstance(doi, str)): doi = DOI(doi) if (doi): oapdffree = doi.freedownload(outtuple=True) if (oapdffree[0] and oapdffree[1]): r = requests.get(self.url + "&doi=" + doi + "&update=True&oapdf=True&free=True", timeout=TIMEOUT_SETTING) elif oapdffree[0]: r = requests.get(self.url + "&doi=" + doi + "&update=True&oapdf=True", timeout=TIMEOUT_SETTING) elif oapdffree[1]: r = requests.get(self.url + "&doi=" + doi + "&update=True&free=True", timeout=TIMEOUT_SETTING) return oapdffree return [False, False] except Exception as e: print e, "SF BDCheck SetByCheck Fail.." return [False, False]
def finddoi(self, num, prefix='', issn=''): title = self.gettitle(num) doi = DOI(self.getdoi(num)) if (not prefix): prefix = doi.split('/', 1)[0] if doi else "" volume = self.getvolume(num) journal = self.getjournalfull(num) year = self.getyear(num) pages = self.getpages(num) self.cr = CRrecord() try: # The origin doi maybe true. Find in crossref if (doi and self.cr.getfromdoi(doi, fullparse=False) and self.cr.doi): # Further check title if (strdiff(doi,self.cr.doi)>=0.85 and \ strsimilarity(normalizeString(title),normalizeString(self.cr.title))>0.75): return doi if (volume and pages): ops = pages.split('-') crps = self.cr.pages.split('-') if (len(ops) > 0 and len(crps) > 0 and ops[0] == crps[0] and volume == self.cr.volume): return doi if (year and pages): ops = pages.split('-') crps = self.cr.pages.split('-') if (len(ops) > 0 and len(crps) > 0 and ops[0] == crps[0] and year == self.cr.year): return doi print "Origin DOI:", doi, "may be true but record strange..Try title" keyword = title + " " + journal + " " + year + " " + pages + " " + volume if (self.cr.getfromtitledoi(keyword, doi, year=year, limit=10, fullparse=False, prefix=prefix)): if (doi): if (prefix == self.cr.doi.split('/')[0] and strdiff(doi, self.cr.doi) >= 0.85): return self.cr.doi else: print "Error for origin doi: " + doi + "; found: " + self.cr.doi return "" return self.cr.doi if (doi): if (strdiff(doi, self.cr.doi) >= 0.85): return self.cr.doi else: print "Error2 for origin doi: " + doi + "; found: " + self.cr.doi return "" else: return "" except Exception as e: print "Error when find doi..", e, "\nRetry..." return self.finddoi(num, prefix=prefix, issn=issn)
def set(self, doi, oapdf=None, free=None):
    '''Update the bdcheck even oapdf/free in library
    If give a list of doi, just post them. No return.

    doi: one doi (str/DOI), or a list/tuple/set of dois.
    oapdf, free: when true, the matching "&oapdf=True" / "&free=True"
        flag is added to the update request.

    A single doi is sent with one GET. A collection is posted as JSON in
    chunks of 100 with a one second pause after each chunk. Any error is
    printed and swallowed.'''
    try:
        # The same flags are used for the single and the batch request.
        if (oapdf and free):
            flags = "&update=True&oapdf=True&free=True"
        elif oapdf:
            flags = "&update=True&oapdf=True"
        elif free:
            flags = "&update=True&free=True"
        else:
            flags = "&update=True"
        # Test for a collection first: a non-empty list is truthy and must
        # not be treated as a single doi.
        if (isinstance(doi, (list, tuple, set))):
            dois = list(doi)
            for start in range(0, len(dois), 100):
                param = {'dois': json.dumps(dois[start:start + 100])}
                requests.post(self.url + flags, params=param, timeout=TIMEOUT_SETTING)
                time.sleep(1)
            return
        if (isinstance(doi, str)):
            doi = DOI(doi)
        if (doi):
            requests.get(self.url + "&doi=" + doi + flags, timeout=TIMEOUT_SETTING)
    except Exception as e:
        print("%s %s" % (e, "SF BDCheck Set Fail.."))
def finddoiPDFfromFile(self,fname):
    '''Put doi in file and use it to find pdf

    fname: text file with one doi per line. Each line is lower-cased and
    stripped; blank lines are skipped, and so are dois whose
    "<quoted doi>.pdf" already exists in the current directory. Every
    other doi is passed to self.findcrossreftitledoi.'''
    countN=0
    # "with" closes the list file even if a lookup raises.
    with open(fname) as fin:
        for line in fin:
            ldoi=line.lower().strip()
            if not ldoi:
                # Nothing to search for on an empty line.
                continue
            doi=DOI(ldoi)
            if (os.path.exists(doi.quote()+".pdf")):
                continue
            self.findcrossreftitledoi(ldoi)
            #time.sleep(random.randint(1,10))
            countN+=1
            # Force a garbage collection every 10 processed dois.
            if countN>=10:
                gc.collect()
                countN=0
def get(self, doi):
    '''Get list whether [bdcheck,oapdf,free] for single doi
    Return a dict for multi dois

    doi: one doi (str/DOI), or a list/tuple/set of dois.

    Single doi: returns the entry for that doi from the JSON answer ([]
    when the answer has no such key), or [0, 0, 0] when the doi is blank
    or the status is not 200.
    Collection: posts the dois as JSON in chunks of 100 (one second pause
    after each chunk) and returns the merged answers as one dict.
    On any error returns {} for a collection and [0, 0, 0] otherwise.'''
    try:
        # Test for a collection first: a non-empty list is truthy and must
        # not be treated as a single doi.
        if (isinstance(doi, (list, tuple, set))):
            result = {}
            dois = list(doi)
            for start in range(0, len(dois), 100):
                param = {'dois': json.dumps(dois[start:start + 100])}
                r = requests.post(self.url + "&select=True", params=param, timeout=TIMEOUT_SETTING)
                if r.status_code == 200:
                    result.update(r.json())
                time.sleep(1)
            return result
        if (isinstance(doi, str)):
            doi = DOI(doi)
        if (doi):
            r = requests.get(self.url + "&doi=" + doi + "&select=True", timeout=TIMEOUT_SETTING)
            if r.status_code == 200:
                return r.json().get(doi, [])
        return [0, 0, 0]
    except Exception as e:
        print("%s %s" % (e, "SF BDCheck Get Fail.."))
        if (isinstance(doi, (list, tuple, set))):
            return {}
        return [0, 0, 0]
def process(self, fname="", cleannote=False, prefix='', issn='', start=0):
    '''Attach checked DOIs to the records and rename their PDFs.

    For every record from `start` up to self.length:
      * records that already carry an "internal-pdf://OAPDF/" attachment or
        an oapdf.sourceforge.net url are treated as done (only the
        "chk: <doi>" tag is refreshed);
      * otherwise the DOI is taken from the record when its
        "electronic-resource-num" tag starts with "chk:", else looked up
        with self.finddoi and stored as "chk: <doi>";
      * each attached PDF is renamed to "<quoted doi>.pdf" under
        <path>/PDF, and an OAPDF attachment/url is added when
        doi.is_oapdf() is true.

    fname: when given, the library is written to it at the end.
    cleannote: call self.cleannote on each processed record.
    prefix, issn: passed to self.finddoi.
    Errors on a record are printed and the loop goes on. Returns 0.'''
    epath = self.getpath()
    print "Output", self.length, "to", epath + os.sep + fname
    for i in range(start, self.length):
        try:
            #if (i%100 is 0):
            #    print
            #    print "Doing:",i+1,
            #else:
            #    print i+1,
            pdfs = self.getpdf(i)
            urls = self.geturl(i)
            # Fast consider as record process before
            hasfound = False
            for pdf in pdfs:
                if "internal-pdf://OAPDF/" in pdf:
                    hasfound = True
                    doistr = self.gettag(i, "electronic-resource-num")
                    if (doistr and len(doistr) > 4 and doistr[:4] == 'chk:'):
                        doi = DOI(self.getdoi(i))
                        if doi:
                            self.setdoi(i, "chk: " + doi)
                    break
            if not hasfound:
                for url in urls:
                    if "http://oapdf.sourceforge.net/cgi-bin/" in url:
                        hasfound = True
                        doistr = self.gettag(i, "electronic-resource-num")
                        if (doistr and len(doistr) > 4 and doistr[:4] == 'chk:'):
                            doi = DOI(self.getdoi(i))
                            if doi:
                                self.setdoi(i, "chk: " + doi)
                        break
            if hasfound:
                continue
            if (cleannote):
                self.cleannote(i)
            doistr = self.gettag(i, "electronic-resource-num")
            if (doistr and len(doistr) > 4 and doistr[:4] == 'chk:'):
                # Already checked before: trust the stored DOI.
                doi = DOI(self.getdoi(i))
            else:
                doi = DOI(self.finddoi(i, prefix=prefix, issn=issn))
                if doi:
                    self.setdoi(i, "chk: " + doi)
            oapdflink = ""
            if (doi and doi.is_oapdf()):
                oapdflink = "http://oapdf.sourceforge.net/cgi-bin/doipage.cgi?doi=" + doi
            newpdfs = []
            for pdf in pdfs:
                # Attachment links are relative to <path>/PDF.
                pdfpath = pdf.replace("internal-pdf://", epath + os.sep + "PDF" + os.sep)
                relpath = pdf.replace("internal-pdf://", "")
                # should never happen
                # (the attachment is already named after the DOI: keep it)
                if (relpath == doi.quote() + ".pdf"):
                    newpdfs.append(pdf)
                    continue
                if (doi):
                    if (os.path.exists(pdfpath)):
                        try:
                            os.renames(
                                pdfpath, epath + os.sep + "PDF" + os.sep +
                                doi.quote() + ".pdf")
                            newpdfs.append("internal-pdf://" + doi.quote() +
                                           ".pdf")
                        except:
                            # Keep the old link when the rename fails.
                            print "Can't rename:", pdf, 'to', doi.quote(
                            ) + ".pdf"
                            newpdfs.append(pdf)
                            continue
                    else:
                        print "Maybe error for the record", doi, "with pdf path:", pdf, '; Try finding..',
                        # The linked file is missing: if its directory holds
                        # exactly one pdf, take that one instead.
                        pdfdir = os.path.split(pdfpath)[0]
                        if (os.path.exists(pdfdir)):
                            fs = glob.glob(pdfdir + os.sep + '*.pdf')
                            if (len(fs) == 1):
                                try:
                                    os.renames(
                                        fs[0], epath + os.sep + "PDF" + os.sep
                                        + doi.quote() + ".pdf")
                                    newpdfs.append("internal-pdf://" +
                                                   doi.quote() + ".pdf")
                                    print "Find", fs[0], 'and rename!'
                                except:
                                    print "Can't rename:", fs[
                                        0], 'to', doi.quote() + ".pdf"
                                    newpdfs.append(pdf)
                                    continue
                            else:
                                print "Can't find.."
                                newpdfs.append(pdf)
                                continue
                        else:
                            newpdfs.append(pdf)
                            continue
                else:
                    print "Blank doi for file:", pdf
                    newpdfs.append(pdf)
                    continue
            if (oapdflink):
                newpdfs.append("internal-pdf://OAPDF/" + doi.quote() + ".pdf")
            self.setpdfs(i, newpdfs)
            # Set the urls
            if (oapdflink and oapdflink not in urls):
                self.addurl(i, oapdflink, first=True)
        except Exception as e:
            # One bad record must not stop the whole run.
            print "Error at ", i, 'since: ', e
            #return 1
    if fname:
        self.write(fname)
    return 0
def findPDFbyISSN(self,issn,maxresult=None, step=100, offset=0, usedoi=True,doifilter=None,onlinecheck=True,savestate=None,proxy=None,usebdcheck=True): '''Find PDF by ISSN based on search result from crossref''' # may be improve to not only issn.. if (not issn):return if (len(issn)==9 and issn[4]=='-'): needurl="http://api.crossref.org/journals/"+issn+"/works" elif('10.' in issn): needurl="http://api.crossref.org/prefixes/"+issn+"/works" else: print "Error ISSN/prefix" sys.exit(1) cr=CRrecord() total=cr.gettotalresultfromlink(needurl) if (not maxresult or maxresult <=0 or maxresult>total): maxresult=total params={"rows":str(step)} maxround=(maxresult-offset)/step+1 offsetcount=offset bdcheck=BDCheck() for i in range(maxround): params["offset"]=str(step*i+offset) r=requests.get(needurl,params,timeout=timeout_setting_download) if (r.status_code is 200): # Get all check/in oapdf if usebdcheck: bdcheckall=bdcheck.filterdois(r.json(),oapdf=1,crjson=True) for j in r.json().get('message',{}).get('items',[]): keyword=j.get('title',['']) doi=DOI(j.get("DOI","")) if not doi: offsetcount+=1 time.sleep(2) continue # Check whether in bdcheck if (usebdcheck and doi in bdcheckall): print doi, 'has search/oapdf/free by bdcheck' offsetcount+=1 time.sleep(1) continue # If not in bdcheck, check oapdf/free and set it # TODO: remove it after combine oapdf information to library oapdffree=bdcheck.setbycheck(doi) if (oapdffree[0] or oapdffree[1]): print doi,'exist in oapdf/free library..' offsetcount+=1 time.sleep(1) continue if (keyword): keyword=keyword[0] else: time.sleep(2) offsetcount+=1 continue if usedoi:keyword+=" "+doi print "#####################################",offsetcount,"####################################" print "## Now finding for doi with title:"+doi+" "+ keyword.encode('utf-8')+"............" 
sys.stdout.flush() self.search(keyword.encode('utf-8'),proxy=proxy) bdresult=self.getallpdf(doifilter,onlinecheck=onlinecheck,savestate=savestate,usebdcheck=usebdcheck) bdcheck.set(doi) offsetcount+=1 gc.collect() print "End of process for",issn
def getallpdf(self,doifilter=None,onlinecheck=True,savestate=None,usebdcheck=True): '''Get All pdf from link doifilter should be a function, return True when DOI ok''' usedoifilter=callable(doifilter) getallfilelist=[] if isinstance(savestate,(list,tuple,set)): savestate=set(savestate) elif (isinstance(savestate,int)): savestate=set([savestate]) else: savestate=set([0,1,2,3]) bdcheck=BDCheck() for i in range(len(self.items)): try: getfilelist=[] # Get PDF links links=self.getpdflink(i) if (links): doi=DOI(self.getdoi(i)) if not doi: print "blank doi..",doi continue if ( usedoifilter and not doifilter(doi)): print doi,'Not fit filter..' continue # Check by bdcheck api if (usebdcheck): bdout=bdcheck.get(doi) if sum(bdout)>0: print doi, 'has search/oapdf/free',bdout continue oapdffree=bdcheck.setbycheck(doi) if (oapdffree[0] and oapdffree[1]): print doi,'exist in oapdf/free library..' continue elif oapdffree[0]: print doi,'exist in oapdf library..' continue elif oapdffree[1]: print doi,'exist in free library..' continue doifname=doi.quote()+".pdf" if (pdfexistpath(doifname)): print doi,'Files exist in current folder..' continue # Start to find pdf at each link print "### Find for result with DOI: "+doi foundDonePDF=False for link in links: print 'Link:',str(link), if (onlinecheck): print "Try Getting..", # Get a StringIO obj getpdfobj=getwebpdf(link,fname=doifname,params=getwebpdfparams(link),stringio=True) if (not getpdfobj): continue try: dpfresult=self.pdfcheck.checkonlinepdf(fobj=getpdfobj,doi=doi) sys.stdout.flush() if (dpfresult!=0): if ( savestate and (dpfresult in savestate)): #Important to set fname to None rmresult=self.pdfcheck.removegarbage(fname=None,notdelete=True) if (rmresult <= 1): getfilelist.append( (getpdfobj,self.pdfcheck.realdoi,dpfresult)) else: print "Not OK PDF for doi",doi else: foundDonePDF=True if (self.pdfcheck.savefobj2file(doi=self.pdfcheck.realdoi,state=0,fobj=getpdfobj)): print "!!!!!!! 
Get PDF file to Done!: "+self.pdfcheck.realdoi del getfilelist[:] nowdoi=DOI(self.pdfcheck.realdoi) getallfilelist.append('Done/'+nowdoi.quote()+'.pdf') break else: print "What? should never happen for pdfdoicheck.savefobj2file Done.." except Exception as e: print e,'Error at baidu getallpdf(web) when doing pdfcheck',doi,link # Now should not use this method elif (getwebpdf(link,fname=doifname,params=getwebpdfparams(link))): print "Please don't use download pdf to disk, use check online!" print "Try Getting..", try: dpfresult=self.pdfcheck.renamecheck(doifname) sys.stdout.flush() if (dpfresult!=0): if ( savestate and (dpfresult in savestate)): #Important to set fname to None rmresult=self.pdfcheck.removegarbage(fname=None) if (rmresult <= 1): if (os.path.exists(self.pdfcheck._fname)): getfilelist.append((self.pdfcheck._fname, dpfresult)) else: print "What? should never happen for pdfdoicheck.moveresult Not Done.." else: print "Has been removed.." else: if (os.path.exists(self.pdfcheck._fname)) : os.remove(self.pdfcheck._fname) else: foundDonePDF=True if (os.path.exists(self.pdfcheck._fname)): print "!!!!!!! Get PDF file to Done!: "+doifname getfilelist.append(self.pdfcheck._fname) #time.sleep(random.randint(1,5)) break else: print "What? should never happen for pdfdoicheck.moveresult Done.." except Exception as e: if os.path.exists(doifname): if (not os.path.exists('tmpfail/'+doifname)): os.renames(doifname,'tmpfail/'+doifname) else: os.remove(doifname) print e,'Error at baidu getallpdf when doing pdfcheck' else: print "can't get at this link" bdcheck.set(doi) # Online Check but not Done if onlinecheck and not foundDonePDF and len(getfilelist)>0: minnum=-1 minresult=999999 for i in range(len(getfilelist)): if getfilelist[i][2]<minresult: minnum=i nowdoi=DOI(getfilelist[minnum][1]) if (self.pdfcheck.savefobj2file(doi=nowdoi,state=getfilelist[minnum][2],fobj=getfilelist[minnum][0])): print "!!!!!!! 
Get PDF file to: "+self.pdfcheck.judgedirs.get(getfilelist[minnum][2],'.'),self.pdfcheck.realdoi getallfilelist.append(self.pdfcheck.judgedirs.get(getfilelist[minnum][2],'.')+os.sep+nowdoi.quote()+".pdf") del getfilelist[:] except Exception as e: print e, "##### Error when get pdf.." return getallfilelist
def renamecheck(self,fname,wtitle=0.65,cutoff=0.85,justcheck=False,resetfile=True,fdoi=None,excludedoi=None, fobj=None):
    '''Judge whether the pdf really is the paper its DOI says it is.

    The DOI comes from fdoi or, when that is not given, from the file name.
    It is validated with CRrecord.valid_doi and the pdf is compared with
    the crossref record (page count, DOI text in the pages, title, score
    from self.scorefitting). Unless justcheck is set the file is moved with
    self.moveresult according to the result.

    fname: file to check (self.reset(fname) is called when resetfile).
    wtitle: weight of the title similarity in the combined score.
    cutoff: combined score needed to accept the title (0.1 lower for
        papers from 1990 or before).
    justcheck: only judge, never call moveresult (but see the review note
        on the ">3 dois" branch).
    resetfile: reload the file/file object before checking.
    fdoi: DOI to test instead of the one in the file name.
    excludedoi: set of DOIs already tried; a non-empty set means this call
        is part of a recursive check and forces justcheck.
    fobj: file/StringIO object to check instead of a file on disk.

    Returns the judge number listed below; 9 is also returned for
    inconsistent arguments.'''
    ### Result back:
    # 0: Done
    # 1: High
    # 2: Unsure
    # 3: Untitle
    # 4: Fail
    # 5: Page0
    # 6: ErrorDOI
    # 10: Unknow
    if (resetfile and isinstance(fobj,(file,StringIO))):
        self.reset(fname="",fobj=fobj)
        # "None" (a string) marks "no file name, a file object is used".
        fname="None"
    # len(self.doi) is 1 and len(self.doi - excludedoi) is 1 :
    #    :: First Run and perform check
    # len(self.doi) is 1 or len(self.doi - excludedoi) is 1 :
    if (not fname and not fdoi):
        print "No given file name or doi! (Return 6)"
        return 6
    if (fname and not fdoi and excludedoi):
        print "What do you want?! No excludedoi set by user! (Return 9)"
        return 9
    if (resetfile and fname !="None"):
        self.reset(fname)
    elif(resetfile and not isinstance(fobj,(file,StringIO))):
        print "Use reset file but no file name/object is given!"
        return 9
    if (self.maxpage == 0):
        if not justcheck:
            self.moveresult(5, printstr="Error Page 0 (Page0, R5): "+self._fname)
        return 5
    if (not excludedoi):
        excludedoi=set()
    if (not fdoi):
        #File obj is ""
        # No DOI given: take it from the file name without extension.
        fdoi=DOI(os.path.splitext(os.path.basename(self._fname))[0])
    else:
        fdoi=DOI(fdoi)
    recursive= (len(excludedoi) > 0)
    # If in recursive, don't move file!
    if recursive:
        justcheck=True
    if resetfile and not recursive:
        self.realdoi=fdoi
    # Only find DOI in first time!
    if (not recursive and fdoi):
        self.finddoi(1)
    elif (not recursive and not fdoi):
        self.finddoi(set([1,2,self.maxpage]))
    # The file-name doi is unusable: recursively use doi in file or fail
    if (not fdoi and not recursive):
        if (len(self.doi) is 1 or len(self.doi) is 2):
            print "Origin fdoi wrong but has 1~2 dois in file:",self._fname,
            return self.recursivedoicheck(excludedoi,olddoi=fdoi,wtitle=wtitle,cutoff=cutoff,justcheck=justcheck)
        # No doi or >2 dois in file
        else:
            if not justcheck:
                self.moveresult(4,printstr="Error fdoi and 0/too much doi. (Fail): "+self._fname)
            return 4
    elif (not fdoi and recursive):
        print "doi (in recursion) may wrong with error doi. Should never happen.."
        return 4 # Fail
    # fdoi is ok
    cr=CRrecord()
    try:
        cr=cr.valid_doi(fdoi,fullparse=True)
    except requests.exceptions.RequestException as e:
        print e
        cr=None
    except Exception as e:
        print e
        cr=None
    # Error when year=None, improve in crrecord.
    #if (cr and not cr.year):
    #    cr.year='8888'
    #crossref is ok
    if (fdoi and cr):
        totalpagenumber=1
        try:
            totalpagenumber=self.totalpages(cr.pages)
        except ValueError as e:
            # should never happen now
            print e, cr.pages
        totalpagewrong=False
        #print "pages:",self.maxpage,' in crossref:',cr.pages,totalpagenumber
        # The pdf may have up to 2 pages more than the record says.
        if totalpagenumber>0 and not (self.maxpage >= totalpagenumber and self.maxpage <= totalpagenumber+2):
            totalpagewrong=True
            # When paper with supporting information
            if (self.maxpage > totalpagenumber+2):
                self.finddoi(page=2)
                if (self.withSI or (self.findtext('Supporting Information', page=[totalpagenumber+1,totalpagenumber+2]) and self.findtext(cr.title, similarity=0.75, page=[totalpagenumber+1,totalpagenumber+2]))):
                    if not recursive : self.finddoi(totalpagenumber);
                    self.withSI=True
                    totalpagewrong=False
                # For NIH Public Access
                elif (self.hascontent("NIH Public Access")[0]):
                    totalpagewrong=False
                #Such as some Nature with SI in paper without notify.
                elif (self.withSI or (totalpagenumber>1 and self.findtext("acknowledgment", page=[totalpagenumber-1, totalpagenumber]) and self.findtext("reference", page=[totalpagenumber-1, totalpagenumber]))):
                    self.withSI=True
                    totalpagewrong=False
        # Recursive but total page wrong. Fast end recursivedoicheck
        if (totalpagewrong and recursive):
            return 4
        # Just check first page, not find(find before..), faster:
        doivalid=self.checkdoi(fdoi,page=1,iterfind=False,justcheck=True)
        titleeval=self.checktitle(cr.title)
        if (totalpagenumber > 0 and not totalpagewrong):
            if (doivalid and titleeval[0] and len(self.doi) is 1):
                # Yes! Very Good PDF!
                self.realdoi=fdoi
                if not justcheck:
                    if (self.maxpage>=2 and self.maxpage == totalpagenumber and not self.findtext('Supporting Information', page=[1])):
                        self.moveresult(0,good=True)
                    else:
                        self.moveresult(0)
                return 0
        # Further check doi in page2/last, Finally, will check 1,2 and last pages.
        if (recursive):
            doivalid= ( self.checkdoi(fdoi,page=2,iterfind=True,justcheck=True) or doivalid )
        else:
            doivalid= ( self.checkdoi(fdoi,page=2,iterfind=True) or doivalid )
        if len(self.doi)>3:
            # Too much doi may be some abstract
            # NOTE(review): unlike every other moveresult call here, this
            # one is not guarded by "if not justcheck" -- confirm intended.
            self.moveresult(2,printstr='Has more than 3 dois! (Unsure):'+self._fname)
            return 2
        # Page wrong and try recursive use doi
        if (totalpagewrong):
            if (len(self.doi) is 1 or len(self.doi) is 2):
                doi=DOI(list(self.doi)[0])
                # DOI in file is same so error. Don't need recursive
                if (len(self.doi) is 1 and doi == fdoi):
                    if not justcheck:
                        self.moveresult(4,printstr="PDF Page "+str(self.maxpage)+"!="+str(totalpagenumber)+"(Fail): "+self._fname)
                    return 4
                print 'Wrong total page with dois in file,',self._fname,fdoi,',try recursive'
                return self.recursivedoicheck(excludedoi,olddoi=fdoi,wtitle=wtitle,cutoff=cutoff,justcheck=justcheck)
            else:
                if not justcheck:
                    self.moveresult(4,printstr="PDF Page "+str(self.maxpage)+"!="+str(totalpagenumber)+"(Fail): "+self._fname)
                return 4
        if (not totalpagewrong):
            crscore=self.scorefitting(cr)
            if (self.maxpage <= totalpagenumber+2):
                # Maybe check when maxpage >total+2
                titleeval=self.checktitle(cr.title)
            # A blank crossref title cannot be matched: use a fixed score.
            if cr.title.strip()=="":
                titleeval=(False,0.9)
            titlevalid=titleeval[0]
            try:
                paperyear=int(cr.year)
            except:
                paperyear=9999
            try:
                # Too old maybe lost information
                if (paperyear>1990):
                    titlevalid=titlevalid or (titleeval[1]*wtitle+crscore['total'])>=cutoff
                else:
                    titlevalid=titlevalid or (titleeval[1]*wtitle+crscore['total'])>=cutoff-0.1
                #(self.checktitle(cr.title,similarity=0.85) and self.checkcrossref(cr))
            except Exception as e:
                print e
            if (doivalid):
                if (titlevalid):
                    # Yes! Good PDF!
                    self.realdoi=fdoi
                    if not justcheck:
                        if (self.maxpage>=2 and self.maxpage == totalpagenumber and len(self.doi) is 1 and not self.findtext('Supporting Information', page=[1])):
                            self.moveresult(0,good=True)
                        else:
                            self.moveresult(0)
                    return 0
                print "Title/Paper score:",titleeval[1],crscore,self._fname
                if (len(self.doi - set([fdoi])) == 1 and not recursive):
                    # Try one more
                    newresult = self.recursivedoicheck(excludedoi,olddoi=fdoi,wtitle=wtitle,cutoff=cutoff,justcheck=True)
                    if (newresult is 0):
                        newdoi=DOI(list(self.doi - set([fdoi]))[0])
                        self.realdoi=newdoi
                        print
                        if not justcheck:
                            self.moveresult(0, printstr="(Rename)fdoi ok, but not title. In file doi "+newdoi+" is better for "+self._fname, newfname=newdoi.quote()+".pdf")
                        return 0
                # Else DOI ok but not title
                if not justcheck:
                    self.moveresult(3,printstr="OK fdoi but not title(Untitle): "+self._fname)
                return 3
            # Indeed, doi maybe in pdf, but strange format..
            if (self.checkdoinormaltxt(fdoi)):
                if (titlevalid):
                    # Further check only when title OK
                    if (self.checkdoifurther(fdoi)):
                        # Fine! move to Done dir
                        if not justcheck:
                            if (self.maxpage>=2 and self.maxpage == totalpagenumber and len(self.doi) is 1 and not self.findtext('Supporting Information', page=[1])):
                                self.moveresult(0,good=True)
                            else:
                                self.moveresult(0)
                        return 0
                    else:
                        # Can't find, but high similar! move to High dir
                        if not justcheck:
                            self.moveresult(1,printstr="OK title and nospacebreak doi,but not pass(High): "+self._fname)
                        return 1
                else:
                    # DOI ok but not title
                    print "Title/Paper score:",titleeval[1],crscore,self._fname
                    if not justcheck:
                        self.moveresult(3,printstr="Maybe OK fdoi but not title(Untitle): "+self._fname)
                    return 3
            # DOI maybe not exist ....
            if (titlevalid):
                # Drop in-file dois that valid_doiorg rejects
                # (iterate over a copy because self.doi is modified).
                tmpdois=set(self.doi)
                for d in tmpdois:
                    dd=DOI(d)
                    if ( not dd.valid_doiorg(geturl=False) ):
                        self.doi.remove(d)
                # Old paper don't have doi...
                if len(self.doi) is 0 and totalpagenumber>0:
                    if (crscore['total'] >= 0.4):
                        if not justcheck:
                            if (self.maxpage>=2 and self.maxpage == totalpagenumber and not self.findtext('Supporting Information', page=[1])):
                                self.moveresult(0,good=True)
                            else:
                                self.moveresult(0)
                        return 0
                    elif (titleeval[1]>=0.85 and crscore['total'] >= 0.35):
                        if not justcheck:
                            if (self.maxpage>=2 and self.maxpage == totalpagenumber and not self.findtext('Supporting Information', page=[1])):
                                self.moveresult(0,good=True)
                            else:
                                self.moveresult(0)
                        return 0
                    elif (titleeval[1]>=0.95 and crscore['total'] >=0.3):
                        if not justcheck:
                            if (self.maxpage>=2 and self.maxpage == totalpagenumber and not self.findtext('Supporting Information', page=[1])):
                                self.moveresult(0,good=True)
                            else:
                                self.moveresult(0)
                        return 0
                    elif (titleeval[1]>=0.90 and crscore['pages']>=0.9 and crscore['year']>=0.9 and (crscore['journal']>=0.9 or crscore['issn']>=0.9)):
                        if not justcheck:
                            self.moveresult(0)
                        return 0
                    elif (titleeval[1]>=0.90 and crscore['pages']>=0.5 and crscore['year']>=0.9 and (crscore['journal']>=0.9 or crscore['issn']>=0.9) and crscore['authors']>=0.7):
                        if not justcheck:
                            self.moveresult(0)
                        return 0
                    elif (titleeval[1]>=0.75 or crscore['total'] >=0.25):
                        print "Title/Paper score:",titleeval[1],crscore,self._fname
                        if not justcheck:
                            self.moveresult(1,printstr="OK title and high info fit. But no doi(Highly): "+self._fname)
                        return 1
                    else:
                        print "Title/Paper score:",titleeval[1],crscore,self._fname
                        if not justcheck:
                            self.moveresult(2,printstr="OK title and ok info fit. But no doi(Unsure): "+self._fname)
                        return 2
                elif len(self.doi) is 0 and totalpagenumber== -1:
                    if (titleeval[1]>=0.90 and crscore['pages']>=0.5 and crscore['year']>=0.9 and (crscore['journal']>=0.9 or crscore['issn']>=0.9) and crscore['authors']>=0.7):
                        if not justcheck:
                            self.moveresult(0)
                        return 0
                    else:
                        print "Title/Paper score:",titleeval[1],crscore,self._fname
                        if not justcheck:
                            self.moveresult(2,printstr="OK title and high info fit. But no doi and no total pages(Unsure): "+self._fname)
                        return 2
                elif len(self.doi) is 0 and totalpagenumber<=0:
                    print "Title/Paper score:",titleeval[1],crscore,self._fname
                    if not justcheck:
                        self.moveresult(2,printstr="OK title and high info fit. But no doi and no total pages(Unsure): "+self._fname)
                    return 2
                elif ( len(self.doi) > 0 and not recursive):
                    print "Good title but file doesn't contain fdoi, however it has >0 doi in file. "
                    outnow=self.recursivedoicheck(excludedoi,olddoi=fdoi,wtitle=wtitle,cutoff=cutoff,justcheck=True)
                    if outnow > 0:
                        if not justcheck:
                            self.moveresult(2,printstr="OK title but not fdoi. In file doi is not good(Unsure): "+self._fname)
                        return 2
                    elif(outnow==0):
                        print 'Good Title but Fail fdoi. Paper has good in file doi,',self._fname,',try recursive'
                        # NOTE(review): the call above already added every
                        # in-file doi to excludedoi, so this second call
                        # looks like it has nothing left to try -- confirm.
                        return self.recursivedoicheck(excludedoi,olddoi=fdoi,wtitle=wtitle,cutoff=cutoff,justcheck=justcheck)
                ### Old method check old items:
                #if (self.checkcrossref(cr)):
                #    if (int(cr.year)<=1999 and len(self.doi) is 0):
                #        # Highly possible right
                #        if not justcheck: self.movetodir("High")
                #        return True
                #    # Bentham, often blank doi
                #    elif (fdoi[:8] == '10.2174/' and len(self.doi) is 0):
                #        if not justcheck: self.movetodir("Done")
                #        return True
                #    elif (len(self.doi) is 0):
                #        print "Title/Paper score:",titleeval[1],crscore,self._fname
                #        if not justcheck:
                #            self.moveresult(1,printstr="OK title and high info fit. But no doi(Highly): "+self._fname)
                #        return 1
                #    else:
                #        if not justcheck:
                #            self.moveresult(2,printstr="OK title and high info fit. But doi exist not fit(Unsure): "+self._fname)
                #        return 2
                #elif(len(self.doi) is 0):
                #    # Maybe wrong file and no doi
                #    if not justcheck:
                #        self.moveresult(2,printstr="Not found doi in file but ok title (Unsure): "+self._fname)
                #    return 2
            #fdoi,title wrong, no doi in file
            # Or in recursive mode
            if (len(self.doi) is 0 or recursive):
                if not justcheck:
                    self.moveresult(4,printstr="Both fdoi and title wrong, no doi in file(Fail): "+self._fname)
                return 4
            # Indeed, file has only one more doi, not the same to fname
            if (len(self.doi - set([fdoi])) is 1 ):
                print 'Fail fdoi/title. Paper with one more doi in file,',self._fname,',try recursive'
                return self.recursivedoicheck(excludedoi,olddoi=fdoi,wtitle=wtitle,cutoff=cutoff,justcheck=justcheck)
            elif(len(self.doi) > 1):
                if not justcheck:
                    self.moveresult(4,printstr="fdoi/title fail. Too much infile doi(Fail): "+self._fname)
                return 4
            else:
                if not justcheck:
                    self.moveresult(4,printstr="What????? What?????(Fail):"+self._fname)
                return 4
    # not cr
    else:
        if (not recursive):
            self.finddoi(set([1,2,self.maxpage]))
        if (len(self.doi) is 1 or len(self.doi) is 2):
            print 'Error DOI filename,',self._fname,',try recursive'
            return self.recursivedoicheck(excludedoi,olddoi=fdoi,wtitle=wtitle,cutoff=cutoff,justcheck=justcheck)
        if not justcheck:
            self.moveresult(6,"Error DOI fname(Fail):"+self._fname)
        return 6