def grepBingAcadPDFbyID(self,bid,maxpage=1,printyn=True): '''Grep at most maxpage pages pdf for given bing id Save to doi style based on refering to crossref.''' if (printyn): print "### ### ### ### ### ### ### ### ### " print "## Finding for "+bid+"...." cr=CRrecord() ref=self.bidref(bid) if (printyn): print ref if (os.path.exists(bid+".pdf")): print "Exist file:"+bid+".pdf" return if ref['title']: if (cr.getfromtitle(title=ref['title'],year=ref['year'],volume=ref['volume'], pages=ref['pages'],issue=ref['issue'],fullparse=False) and cr.doi): # try to find by title, if found (true): if (printyn): print cr outname=quotefileDOI(cr.doi)+".pdf" if (not os.path.exists(outname)): if (self.getbidpdf(bid,filename=outname,printyn=printyn)): print "Have Found PDF file: "+outname else: print "Exist file:"+outname else: if (self.getbidpdf(bid,filename=bid+".pdf",printyn=printyn)): print "Have Found PDF file: "+bid+".pdf" else: if (self.getbidpdf(bid,filename=bid+".pdf",printyn=printyn)): print "Have Found PDF file: "+bid+".pdf"
def finddoi(self, num, prefix='', issn=''): title = self.gettitle(num) doi = DOI(self.getdoi(num)) if (not prefix): prefix = doi.split('/', 1)[0] if doi else "" volume = self.getvolume(num) journal = self.getjournalfull(num) year = self.getyear(num) pages = self.getpages(num) self.cr = CRrecord() try: # The origin doi maybe true. Find in crossref if (doi and self.cr.getfromdoi(doi, fullparse=False) and self.cr.doi): # Further check title if (strdiff(doi,self.cr.doi)>=0.85 and \ strsimilarity(normalizeString(title),normalizeString(self.cr.title))>0.75): return doi if (volume and pages): ops = pages.split('-') crps = self.cr.pages.split('-') if (len(ops) > 0 and len(crps) > 0 and ops[0] == crps[0] and volume == self.cr.volume): return doi if (year and pages): ops = pages.split('-') crps = self.cr.pages.split('-') if (len(ops) > 0 and len(crps) > 0 and ops[0] == crps[0] and year == self.cr.year): return doi print "Origin DOI:", doi, "may be true but record strange..Try title" keyword = title + " " + journal + " " + year + " " + pages + " " + volume if (self.cr.getfromtitledoi(keyword, doi, year=year, limit=10, fullparse=False, prefix=prefix)): if (doi): if (prefix == self.cr.doi.split('/')[0] and strdiff(doi, self.cr.doi) >= 0.85): return self.cr.doi else: print "Error for origin doi: " + doi + "; found: " + self.cr.doi return "" return self.cr.doi if (doi): if (strdiff(doi, self.cr.doi) >= 0.85): return self.cr.doi else: print "Error2 for origin doi: " + doi + "; found: " + self.cr.doi return "" else: return "" except Exception as e: print "Error when find doi..", e, "\nRetry..." return self.finddoi(num, prefix=prefix, issn=issn)
def findcrossreftitledoi(self,doi,printyn=True): '''Find doi by crossref first''' cr=CRrecord() if( cr.getfromdoi(doi,fullparse=False) and cr.doi): keyword=(cr.title+" "+cr.doi).encode('utf-8') print "#########################################################################" print "## Now finding for doi with title: "+ keyword.encode('utf-8')+"............" sys.stdout.flush() self.grepBingAcadPDF(keyword=keyword,maxpage=1,printyn=printyn) else: print "Error DOI!: "+doi cr.reset()
def getdoi(self, num=0):
    '''Get DOI from Baidu Cite.

    Uses the <doi> element of the citation text when present; otherwise the
    <primarytitle> text is searched in crossref.  Anything before the first
    "10." is dropped.

    num -- index of the citation to read.

    Returns a DOI object; an empty one when no "10." is present.
    '''
    soup = BeautifulSoup(self.getcite(num, citetype='txt'), "html.parser")
    if soup.doi:
        doi = soup.doi.text
    elif soup.primarytitle:
        cr = CRrecord()
        cr.getfromtitle(soup.primarytitle.info.text, ignorecheminfo=True)
        doi = cr.doi
    else:
        doi = ""
    doi = doi or ""
    start = doi.find('10.')
    # find() gives -1 when "10." is absent; slicing with it would return the
    # last character of the text as if it were a DOI.
    if start < 0:
        return DOI("")
    return DOI(doi[start:])
def findcrossreftitledoi(self,doi,printyn=True): '''Find doi by crossref first''' cr=CRrecord() if( cr.getfromdoi(doi,fullparse=False) and cr.doi): keyword=cr.title+" "+cr.doi print "#########################################################################" print "## Now finding for doi with title: "+ keyword.encode('utf-8')+"............" sys.stdout.flush() self.search(keyword=keyword) self.getallpdf() else: print "Error DOI!: "+doi cr.reset()
def finddoi(self,num,prefix='',issn=''): title=self.gettitle(num) doi=DOI(self.getdoi(num)) if (not prefix): prefix = doi.split('/',1)[0] if doi else "" volume= self.getvolume(num) journal=self.getjournalfull(num) year=self.getyear(num) pages=self.getpages(num) self.cr=CRrecord() try: # The origin doi maybe true. Find in crossref if ( doi and self.cr.getfromdoi(doi,fullparse=False) and self.cr.doi): # Further check title if (strdiff(doi,self.cr.doi)>=0.85 and \ strsimilarity(normalizeString(title),normalizeString(self.cr.title))>0.75): return doi if( volume and pages ): ops=pages.split('-') crps=self.cr.pages.split('-') if (len(ops)>0 and len(crps)>0 and ops[0]==crps[0] and volume==self.cr.volume): return doi if( year and pages ): ops=pages.split('-') crps=self.cr.pages.split('-') if (len(ops)>0 and len(crps)>0 and ops[0]==crps[0] and year==self.cr.year): return doi print "Origin DOI:",doi,"may be true but record strange..Try title" keyword=title+" "+journal+" "+year+" "+pages+" "+volume if (self.cr.getfromtitledoi(keyword,doi,year=year,limit=10,fullparse=False,prefix=prefix)): if (doi): if( prefix == self.cr.doi.split('/')[0] and strdiff(doi,self.cr.doi)>=0.85): return self.cr.doi else: print "Error for origin doi: "+doi+"; found: "+self.cr.doi return "" return self.cr.doi if (doi): if( strdiff(doi,self.cr.doi)>=0.85): return self.cr.doi else: print "Error2 for origin doi: "+doi+"; found: "+self.cr.doi return "" else: return "" except Exception as e: print "Error when find doi..",e,"\nRetry..." return self.finddoi(num,prefix=prefix,issn=issn)
def findPDFbyISSN(self,issn,maxresult=None, step=100, offset=0): '''Find PDF by ISSN based on search result from crossref''' # may be improve to not only issn.. if (not issn):return needurl="http://api.crossref.org/journals/"+issn+"/works" cr=CRrecord() total=cr.gettotalresultfromlink(needurl) if (not maxresult or maxresult <=0 or maxresult>total): maxresult=total params={"rows":str(step)} maxround=(maxresult-offset)/step+1 offsetcount=offset for i in range(maxround): params["offset"]=str(step*i+offset) r=requests.get(needurl,params,timeout=timeout_setting_download) if (r.status_code is 200): for j in r.json()['message']['items']: keyword=j.get('title',[''])[0]+" "+j.get("DOI","") print "#####################################",offsetcount,"####################################" print "## Now finding for doi with title: "+ keyword.encode('utf-8')+"............" sys.stdout.flush() bingacad.grepBingAcadPDF(keyword.encode('utf-8')) offsetcount+=1 gc.collect()
class EndnoteXML(object): def __init__(self, fname): if (fname): f = open(fname) self.content = re.sub(r'</?style.*?>', '', f.read()) f.close() else: self.content = "" self.soup = BeautifulSoup(self.content, 'html.parser') self.records = self.soup.records.contents self.length = len(self.records) for i in range(self.length): self.checktag(i, 'titles') self.checktag(i, 'authors') self.checktag(i, 'urls') if (self.records[i].find('related-urls') is None): self.addtag(i, 'related-urls', '', parent='urls') if (self.records[i].find('pdf-urls') is None): self.addtag(i, 'pdf-urls', '', parent='urls') self.checktag(i, 'dates') self.setdoi(i, self.getdoi(i)) #def __repr__(self): # return self.soup.encode() def __str__(self): return self.soup.encode() def reset(self, fname): self.__init__(fname) def read(self, fname): self.__init__(fname) def reads(self, s): self.content = s self.soup = BeautifulSoup(self.content, 'html.parser') self.records = self.soup.records.contents self.length = len(self.records) for i in range(self.length): self.checktag(i, 'titles') self.checktag(i, 'authors') self.checktag(i, 'urls') if (self.records[i].find('related-urls') is None): self.addtag(i, 'related-urls', '', parent='urls') if (self.records[i].find('pdf-urls') is None): self.addtag(i, 'pdf-urls', '', parent='urls') self.checktag(i, 'dates') self.setdoi(i, self.getdoi(i)) def writes(self, encoding='utf-8'): return self.soup.encode(encoding=encoding) def write(self, fname, encoding='utf-8'): f = open(fname, 'w') f.write(self.writes(encoding=encoding)) f.close() def getrecord(self, num): if (num >= self.length): return None return self.records[num] def checktag(self, num, tag): if self.records[num].find(tag) is None: self.addtag(num, tag, value='') def addtag(self, num, tag, value=None, parent=None): '''value can be string, tag''' a = self.soup.new_tag(tag) if value: a.string = value if parent: self.records[num].find(parent).append(a) else: self.records[num].append(a) def gettag(self, num, tag, 
parent=None, obj=False): if parent: if self.records[num].find(parent): if self.records[num].find(parent).find(tag): if (obj): return self.records[num].find(parent).find(tag) else: return self.records[num].find(parent).find(tag).string else: return '' else: return '' else: if self.records[num].find(tag): if (obj): return self.records[num].find(tag) else: return self.records[num].find(tag).string else: return '' def settag(self, num, tag, value, parent=None): if parent: if self.records[num].find(parent): if self.records[num].find(parent).find(tag): self.records[num].find(parent).find(tag).string = value else: self.addtag(num, tag, parent=parent, value=value) else: a = self.soup.new_tag(tag) a.string = value self.addtag(num, parent, parent=None, value=a) else: if self.records[num].find(tag): self.records[num].find(tag).string = value else: self.addtag(num, tag, parent=None, value=value) def getpath(self): db = self.soup.findChild("database") if (db): return os.path.splitext(db['path'])[0] + '.Data' else: return "" def getdoi(self, num): doistr = self.gettag(num, "electronic-resource-num") if (doistr): doiindex = doistr.find('10.') else: doiindex = -1 if (doiindex >= 0): return doistr[doiindex:].lower().strip() else: return "" def setdoi(self, num, value): self.settag(num, "electronic-resource-num", value) def gettitle(self, num): return self.gettag(num, "title") def settitle(self, num, value): self.settag(num, "title", value) def getjournalfull(self, num): return self.gettag(num, 'secondary-title') def getyear(self, num): return self.gettag(num, 'year', 'dates') def setyear(self, num, value): self.settag(num, 'year', value, 'dates') def getvolume(self, num): return self.gettag(num, 'volume') def setvolume(self, num, value): self.settag(num, 'volume', value) def getissue(self, num): return self.gettag(num, 'number') def setissue(self, num, value): self.settag(num, 'number', value) def getpages(self, num): return self.gettag(num, 'pages') def setpages(self, num, value): 
self.settag(num, 'pages', value) def getnotes(self, num): return self.gettag(num, 'notes') def setnotes(self, num, value): self.settag(num, 'notes', value) def geturl(self, num): urls = self.gettag(num, 'related-urls', obj=True) if (urls): return [i.string for i in urls.find_all('url')] else: return [] def seturl(self, num, value): '''Note that it will clean all the url!''' if (self.soup.find('related-urls') is not None): urls = self.gettag(num, 'related-urls', obj=True) if (urls): urls.clear() else: self.addtag(num, 'related-urls', parent='urls') self.addtag(num, 'url', value, 'related-urls') def addurl(self, num, value, first=False): urls = self.gettag(num, 'related-urls', obj=True) a = self.soup.new_tag('url') a.string = value if (urls): if (not first): urls.append(a) else: urls.insert(0, a) else: self.settag(num, 'related-urls', a, 'urls') def getpdf(self, num): urls = self.gettag(num, 'pdf-urls', obj=True) if (urls): return [i.string for i in urls.find_all('url')] else: return [] def setpdf(self, num, value): '''Note that it will clean all the url!''' if (self.soup.find('pdf-urls') is not None): urls = self.gettag(num, 'pdf-urls', obj=True) if (urls): urls.clear() else: self.addtag(num, 'pdf-urls', parent='urls') self.addtag(num, 'url', value, 'pdf-urls') def setpdfs(self, num, value): '''Note that it will clean all the url!''' if (self.soup.find('pdf-urls') is not None): urls = self.gettag(num, 'pdf-urls', obj=True) if (urls): urls.clear() else: self.addtag(num, 'pdf-urls', parent='urls') for url in value: self.addtag(num, 'url', url, 'pdf-urls') def addpdf(self, num, value, first=False): urls = self.gettag(num, 'pdf-urls', obj=True) a = self.soup.new_tag('url') a.string = value if (urls): if (not first): urls.append(a) else: urls.insert(0, a) else: self.addtag(num, 'pdf-urls', a, 'urls') def finddoi(self, num, prefix='', issn=''): title = self.gettitle(num) doi = DOI(self.getdoi(num)) if (not prefix): prefix = doi.split('/', 1)[0] if doi else "" volume = 
self.getvolume(num) journal = self.getjournalfull(num) year = self.getyear(num) pages = self.getpages(num) self.cr = CRrecord() try: # The origin doi maybe true. Find in crossref if (doi and self.cr.getfromdoi(doi, fullparse=False) and self.cr.doi): # Further check title if (strdiff(doi,self.cr.doi)>=0.85 and \ strsimilarity(normalizeString(title),normalizeString(self.cr.title))>0.75): return doi if (volume and pages): ops = pages.split('-') crps = self.cr.pages.split('-') if (len(ops) > 0 and len(crps) > 0 and ops[0] == crps[0] and volume == self.cr.volume): return doi if (year and pages): ops = pages.split('-') crps = self.cr.pages.split('-') if (len(ops) > 0 and len(crps) > 0 and ops[0] == crps[0] and year == self.cr.year): return doi print "Origin DOI:", doi, "may be true but record strange..Try title" keyword = title + " " + journal + " " + year + " " + pages + " " + volume if (self.cr.getfromtitledoi(keyword, doi, year=year, limit=10, fullparse=False, prefix=prefix)): if (doi): if (prefix == self.cr.doi.split('/')[0] and strdiff(doi, self.cr.doi) >= 0.85): return self.cr.doi else: print "Error for origin doi: " + doi + "; found: " + self.cr.doi return "" return self.cr.doi if (doi): if (strdiff(doi, self.cr.doi) >= 0.85): return self.cr.doi else: print "Error2 for origin doi: " + doi + "; found: " + self.cr.doi return "" else: return "" except Exception as e: print "Error when find doi..", e, "\nRetry..." 
return self.finddoi(num, prefix=prefix, issn=issn) def preprocess(self): pass def cleannote(self, num): note = self.getnotes(num) notel = note.lower() if ("time" in notel): self.setnotes(num, notel[notel.find('time'):]) def cleanallpdf(self, exceptOAPDF=True): '''Clean PDF record or except OAPDF record''' for i in range(self.length): if (not exceptOAPDF): self.setpdf(i, '') else: for pdf in self.getpdf(i): if "internal-pdf://OAPDF/" in pdf: self.setpdf(i, pdf) break def process(self, fname="", cleannote=False, prefix='', issn='', start=0): epath = self.getpath() print "Output", self.length, "to", epath + os.sep + fname for i in range(start, self.length): try: #if (i%100 is 0): # print # print "Doing:",i+1, #else: # print i+1, pdfs = self.getpdf(i) urls = self.geturl(i) # Fast consider as record process before hasfound = False for pdf in pdfs: if "internal-pdf://OAPDF/" in pdf: hasfound = True doistr = self.gettag(i, "electronic-resource-num") if (doistr and len(doistr) > 4 and doistr[:4] == 'chk:'): doi = DOI(self.getdoi(i)) if doi: self.setdoi(i, "chk: " + doi) break if not hasfound: for url in urls: if "http://oapdf.sourceforge.net/cgi-bin/" in url: hasfound = True doistr = self.gettag(i, "electronic-resource-num") if (doistr and len(doistr) > 4 and doistr[:4] == 'chk:'): doi = DOI(self.getdoi(i)) if doi: self.setdoi(i, "chk: " + doi) break if hasfound: continue if (cleannote): self.cleannote(i) doistr = self.gettag(i, "electronic-resource-num") if (doistr and len(doistr) > 4 and doistr[:4] == 'chk:'): doi = DOI(self.getdoi(i)) else: doi = DOI(self.finddoi(i, prefix=prefix, issn=issn)) if doi: self.setdoi(i, "chk: " + doi) oapdflink = "" if (doi and doi.is_oapdf()): oapdflink = "http://oapdf.sourceforge.net/cgi-bin/doipage.cgi?doi=" + doi newpdfs = [] for pdf in pdfs: pdfpath = pdf.replace("internal-pdf://", epath + os.sep + "PDF" + os.sep) relpath = pdf.replace("internal-pdf://", "") # should never happen if (relpath == doi.quote() + ".pdf"): newpdfs.append(pdf) 
continue if (doi): if (os.path.exists(pdfpath)): try: os.renames( pdfpath, epath + os.sep + "PDF" + os.sep + doi.quote() + ".pdf") newpdfs.append("internal-pdf://" + doi.quote() + ".pdf") except: print "Can't rename:", pdf, 'to', doi.quote( ) + ".pdf" newpdfs.append(pdf) continue else: print "Maybe error for the record", doi, "with pdf path:", pdf, '; Try finding..', pdfdir = os.path.split(pdfpath)[0] if (os.path.exists(pdfdir)): fs = glob.glob(pdfdir + os.sep + '*.pdf') if (len(fs) == 1): try: os.renames( fs[0], epath + os.sep + "PDF" + os.sep + doi.quote() + ".pdf") newpdfs.append("internal-pdf://" + doi.quote() + ".pdf") print "Find", fs[0], 'and rename!' except: print "Can't rename:", fs[ 0], 'to', doi.quote() + ".pdf" newpdfs.append(pdf) continue else: print "Can't find.." newpdfs.append(pdf) continue else: newpdfs.append(pdf) continue else: print "Blank doi for file:", pdf newpdfs.append(pdf) continue if (oapdflink): newpdfs.append("internal-pdf://OAPDF/" + doi.quote() + ".pdf") self.setpdfs(i, newpdfs) # Set the urls if (oapdflink and oapdflink not in urls): self.addurl(i, oapdflink, first=True) except Exception as e: print "Error at ", i, 'since: ', e #return 1 if fname: self.write(fname) return 0
class EndnoteXML(object): def __init__(self,fname): if (fname): f=open(fname) self.content=re.sub(r'</?style.*?>','',f.read()) f.close() else: self.content="" self.soup=BeautifulSoup(self.content,'html.parser') self.records=self.soup.records.contents self.length=len(self.records) for i in range(self.length): self.checktag(i,'titles') self.checktag(i,'authors') self.checktag(i,'urls') if (self.records[i].find('related-urls') is None): self.addtag(i,'related-urls','',parent='urls') if (self.records[i].find('pdf-urls') is None): self.addtag(i,'pdf-urls','',parent='urls') self.checktag(i,'dates') self.setdoi(i,self.getdoi(i)) #def __repr__(self): # return self.soup.encode() def __str__(self): return self.soup.encode() def reset(self,fname): self.__init__(fname) def read(self,fname): self.__init__(fname) def reads(self,s): self.content=s self.soup=BeautifulSoup(self.content,'html.parser') self.records=self.soup.records.contents self.length=len(self.records) for i in range(self.length): self.checktag(i,'titles') self.checktag(i,'authors') self.checktag(i,'urls') if (self.records[i].find('related-urls') is None): self.addtag(i,'related-urls','',parent='urls') if (self.records[i].find('pdf-urls') is None): self.addtag(i,'pdf-urls','',parent='urls') self.checktag(i,'dates') self.setdoi(i,self.getdoi(i)) def writes(self,encoding='utf-8'): return self.soup.encode(encoding=encoding) def write(self,fname,encoding='utf-8'): f=open(fname,'w') f.write(self.writes(encoding=encoding)) f.close() def getrecord(self,num): if (num>=self.length): return None return self.records[num] def checktag(self,num,tag): if self.records[num].find(tag) is None: self.addtag(num,tag,value='') def addtag(self,num,tag,value=None,parent=None): '''value can be string, tag''' a=self.soup.new_tag(tag) if value: a.string=value if parent: self.records[num].find(parent).append(a) else: self.records[num].append(a) def gettag(self,num,tag,parent=None,obj=False): if parent: if self.records[num].find(parent): if 
self.records[num].find(parent).find(tag): if (obj): return self.records[num].find(parent).find(tag) else: return self.records[num].find(parent).find(tag).string else: return '' else: return '' else: if self.records[num].find(tag): if (obj): return self.records[num].find(tag) else: return self.records[num].find(tag).string else: return '' def settag(self,num,tag,value,parent=None): if parent: if self.records[num].find(parent): if self.records[num].find(parent).find(tag): self.records[num].find(parent).find(tag).string=value else: self.addtag(num,tag,parent=parent,value=value) else: a=self.soup.new_tag(tag) a.string=value self.addtag(num,parent,parent=None,value=a) else: if self.records[num].find(tag): self.records[num].find(tag).string=value else: self.addtag(num,tag,parent=None,value=value) def getpath(self): db=self.soup.findChild("database") if (db): return os.path.splitext(db['path'])[0]+'.Data' else: return "" def getdoi(self,num): doistr=self.gettag(num,"electronic-resource-num") if (doistr): doiindex=doistr.find('10.') else: doiindex=-1 if (doiindex >=0): return doistr[doiindex:].lower().strip() else: return "" def setdoi(self,num,value): self.settag(num,"electronic-resource-num",value) def gettitle(self,num): return self.gettag(num,"title") def settitle(self,num,value): self.settag(num,"title",value) def getjournalfull(self,num): return self.gettag(num,'secondary-title') def getyear(self,num): return self.gettag(num,'year','dates') def setyear(self,num,value): self.settag(num,'year',value,'dates') def getvolume(self,num): return self.gettag(num,'volume') def setvolume(self,num,value): self.settag(num,'volume',value) def getissue(self,num): return self.gettag(num,'number') def setissue(self,num,value): self.settag(num,'number',value) def getpages(self,num): return self.gettag(num,'pages') def setpages(self,num,value): self.settag(num,'pages',value) def getnotes(self,num): return self.gettag(num,'notes') def setnotes(self,num,value): 
self.settag(num,'notes',value) def geturl(self,num): urls=self.gettag(num,'related-urls',obj=True) if (urls): return [ i.string for i in urls.find_all('url') ] else: return [] def seturl(self,num,value): '''Note that it will clean all the url!''' if (self.soup.find('related-urls') is not None): urls=self.gettag(num,'related-urls',obj=True) if (urls): urls.clear() else: self.addtag(num,'related-urls',parent='urls') self.addtag(num,'url',value,'related-urls') def addurl(self,num,value,first=False): urls=self.gettag(num,'related-urls',obj=True) a=self.soup.new_tag('url') a.string=value if (urls): if (not first): urls.append(a) else: urls.insert(0,a) else: self.settag(num,'related-urls',a,'urls') def getpdf(self,num): urls=self.gettag(num,'pdf-urls',obj=True) if (urls): return [ i.string for i in urls.find_all('url') ] else: return [] def setpdf(self,num,value): '''Note that it will clean all the url!''' if (self.soup.find('pdf-urls') is not None): urls=self.gettag(num,'pdf-urls',obj=True) if (urls): urls.clear() else: self.addtag(num,'pdf-urls',parent='urls') self.addtag(num,'url',value,'pdf-urls') def setpdfs(self,num,value): '''Note that it will clean all the url!''' if (self.soup.find('pdf-urls') is not None): urls=self.gettag(num,'pdf-urls',obj=True) if (urls): urls.clear() else: self.addtag(num,'pdf-urls',parent='urls') for url in value: self.addtag(num,'url',url,'pdf-urls') def addpdf(self,num,value,first=False): urls=self.gettag(num,'pdf-urls',obj=True) a=self.soup.new_tag('url') a.string=value if (urls): if (not first): urls.append(a) else: urls.insert(0,a) else: self.addtag(num,'pdf-urls',a,'urls') def finddoi(self,num,prefix='',issn=''): title=self.gettitle(num) doi=DOI(self.getdoi(num)) if (not prefix): prefix = doi.split('/',1)[0] if doi else "" volume= self.getvolume(num) journal=self.getjournalfull(num) year=self.getyear(num) pages=self.getpages(num) self.cr=CRrecord() try: # The origin doi maybe true. 
Find in crossref if ( doi and self.cr.getfromdoi(doi,fullparse=False) and self.cr.doi): # Further check title if (strdiff(doi,self.cr.doi)>=0.85 and \ strsimilarity(normalizeString(title),normalizeString(self.cr.title))>0.75): return doi if( volume and pages ): ops=pages.split('-') crps=self.cr.pages.split('-') if (len(ops)>0 and len(crps)>0 and ops[0]==crps[0] and volume==self.cr.volume): return doi if( year and pages ): ops=pages.split('-') crps=self.cr.pages.split('-') if (len(ops)>0 and len(crps)>0 and ops[0]==crps[0] and year==self.cr.year): return doi print "Origin DOI:",doi,"may be true but record strange..Try title" keyword=title+" "+journal+" "+year+" "+pages+" "+volume if (self.cr.getfromtitledoi(keyword,doi,year=year,limit=10,fullparse=False,prefix=prefix)): if (doi): if( prefix == self.cr.doi.split('/')[0] and strdiff(doi,self.cr.doi)>=0.85): return self.cr.doi else: print "Error for origin doi: "+doi+"; found: "+self.cr.doi return "" return self.cr.doi if (doi): if( strdiff(doi,self.cr.doi)>=0.85): return self.cr.doi else: print "Error2 for origin doi: "+doi+"; found: "+self.cr.doi return "" else: return "" except Exception as e: print "Error when find doi..",e,"\nRetry..." 
return self.finddoi(num,prefix=prefix,issn=issn) def preprocess(self): pass def cleannote(self,num): note=self.getnotes(num) notel=note.lower() if ("time" in notel): self.setnotes(num,notel[notel.find('time'):]) def cleanallpdf(self,exceptOAPDF=True): '''Clean PDF record or except OAPDF record''' for i in range(self.length): if (not exceptOAPDF): self.setpdf(i,'') else: for pdf in self.getpdf(i): if "internal-pdf://OAPDF/" in pdf: self.setpdf(i,pdf) break def process(self,fname="",cleannote=False,prefix='',issn='',start=0): epath=self.getpath() print "Output",self.length,"to",epath+os.sep+fname for i in range(start,self.length): try: #if (i%100 is 0): # print # print "Doing:",i+1, #else: # print i+1, pdfs=self.getpdf(i) urls=self.geturl(i) # Fast consider as record process before hasfound=False for pdf in pdfs: if "internal-pdf://OAPDF/" in pdf: hasfound=True doistr=self.gettag(i,"electronic-resource-num") if (doistr and len(doistr)>4 and doistr[:4]=='chk:'): doi=DOI(self.getdoi(i)) if doi: self.setdoi(i,"chk: "+doi) break if not hasfound: for url in urls: if "http://oapdf.sourceforge.net/cgi-bin/" in url: hasfound=True doistr=self.gettag(i,"electronic-resource-num") if (doistr and len(doistr)>4 and doistr[:4]=='chk:'): doi=DOI(self.getdoi(i)) if doi: self.setdoi(i,"chk: "+doi) break if hasfound: continue if (cleannote): self.cleannote(i) doistr=self.gettag(i,"electronic-resource-num") if (doistr and len(doistr)>4 and doistr[:4]=='chk:'): doi=DOI(self.getdoi(i)) else: doi=DOI(self.finddoi(i,prefix=prefix,issn=issn)) if doi: self.setdoi(i,"chk: "+doi) oapdflink="" if (doi and doi.is_oapdf()): oapdflink="http://oapdf.sourceforge.net/cgi-bin/doipage.cgi?doi="+doi newpdfs=[] for pdf in pdfs: pdfpath=pdf.replace("internal-pdf://",epath+os.sep+"PDF"+os.sep) relpath=pdf.replace("internal-pdf://","") # should never happen if (relpath == doi.quote()+".pdf"): newpdfs.append(pdf) continue if (doi): if (os.path.exists(pdfpath)): try: 
os.renames(pdfpath,epath+os.sep+"PDF"+os.sep+doi.quote()+".pdf") newpdfs.append("internal-pdf://"+doi.quote()+".pdf") except: print "Can't rename:",pdf,'to',doi.quote()+".pdf" newpdfs.append(pdf) continue else: print "Maybe error for the record",doi,"with pdf path:",pdf,'; Try finding..', pdfdir=os.path.split(pdfpath)[0] if (os.path.exists(pdfdir)): fs=glob.glob(pdfdir+os.sep+'*.pdf') if (len(fs)==1): try: os.renames(fs[0],epath+os.sep+"PDF"+os.sep+doi.quote()+".pdf") newpdfs.append("internal-pdf://"+doi.quote()+".pdf") print "Find",fs[0],'and rename!' except: print "Can't rename:",fs[0],'to',doi.quote()+".pdf" newpdfs.append(pdf) continue else: print "Can't find.." newpdfs.append(pdf) continue else: newpdfs.append(pdf) continue else: print "Blank doi for file:",pdf newpdfs.append(pdf) continue if (oapdflink): newpdfs.append("internal-pdf://OAPDF/"+doi.quote()+".pdf") self.setpdfs(i,newpdfs) # Set the urls if (oapdflink and oapdflink not in urls): self.addurl(i,oapdflink,first=True) except Exception as e: print "Error at ", i, 'since: ',e #return 1 if fname: self.write(fname) return 0
def findPDFbyISSN(self,issn,maxresult=None, step=100, offset=0, usedoi=True,doifilter=None,onlinecheck=True,savestate=None,proxy=None,usebdcheck=True): '''Find PDF by ISSN based on search result from crossref''' # may be improve to not only issn.. if (not issn):return if (len(issn)==9 and issn[4]=='-'): needurl="http://api.crossref.org/journals/"+issn+"/works" elif('10.' in issn): needurl="http://api.crossref.org/prefixes/"+issn+"/works" else: print "Error ISSN/prefix" sys.exit(1) cr=CRrecord() total=cr.gettotalresultfromlink(needurl) if (not maxresult or maxresult <=0 or maxresult>total): maxresult=total params={"rows":str(step)} maxround=(maxresult-offset)/step+1 offsetcount=offset bdcheck=BDCheck() for i in range(maxround): params["offset"]=str(step*i+offset) r=requests.get(needurl,params,timeout=timeout_setting_download) if (r.status_code is 200): # Get all check/in oapdf if usebdcheck: bdcheckall=bdcheck.filterdois(r.json(),oapdf=1,crjson=True) for j in r.json().get('message',{}).get('items',[]): keyword=j.get('title',['']) doi=DOI(j.get("DOI","")) if not doi: offsetcount+=1 time.sleep(2) continue # Check whether in bdcheck if (usebdcheck and doi in bdcheckall): print doi, 'has search/oapdf/free by bdcheck' offsetcount+=1 time.sleep(1) continue # If not in bdcheck, check oapdf/free and set it # TODO: remove it after combine oapdf information to library oapdffree=bdcheck.setbycheck(doi) if (oapdffree[0] or oapdffree[1]): print doi,'exist in oapdf/free library..' offsetcount+=1 time.sleep(1) continue if (keyword): keyword=keyword[0] else: time.sleep(2) offsetcount+=1 continue if usedoi:keyword+=" "+doi print "#####################################",offsetcount,"####################################" print "## Now finding for doi with title:"+doi+" "+ keyword.encode('utf-8')+"............" 
sys.stdout.flush() self.search(keyword.encode('utf-8'),proxy=proxy) bdresult=self.getallpdf(doifilter,onlinecheck=onlinecheck,savestate=savestate,usebdcheck=usebdcheck) bdcheck.set(doi) offsetcount+=1 gc.collect() print "End of process for",issn
def renamecheck(self,fname,wtitle=0.65,cutoff=0.85,justcheck=False,resetfile=True,fdoi=None,excludedoi=None, fobj=None): '''A complex function to get doi from file name, check in crossref, check in pdf file, rename it! just check can cancel move file''' ### Result back: # 0: Done # 1: High # 2: Unsure # 3: Untitle # 4: Fail # 5: Page0 # 6: ErrorDOI # 10: Unknow if (resetfile and isinstance(fobj,(file,StringIO))): self.reset(fname="",fobj=fobj) fname="None" # len(self.doi) is 1 and len(self.doi - excludedoi) is 1 : # :: First Run and perform check # len(self.doi) is 1 or len(self.doi - excludedoi) is 1 : if (not fname and not fdoi): print "No given file name or doi! (Return 6)" return 6 if (fname and not fdoi and excludedoi): print "What do you want?! No excludedoi set by user! (Return 9)" return 9 if (resetfile and fname !="None"): self.reset(fname) elif(resetfile and not isinstance(fobj,(file,StringIO))): print "Use reset file but no file name/object is given!" return 9 if (self.maxpage == 0): if not justcheck: self.moveresult(5, printstr="Error Page 0 (Page0, R5): "+self._fname) return 5 if (not excludedoi): excludedoi=set() if (not fdoi): #File obj is "" fdoi=DOI(os.path.splitext(os.path.basename(self._fname))[0]) else: fdoi=DOI(fdoi) recursive= (len(excludedoi) > 0) # If in recursive, don't move file! if recursive: justcheck=True if resetfile and not recursive: self.realdoi=fdoi # Only find DOI in first time! if (not recursive and fdoi): self.finddoi(1) elif (not recursive and not fdoi): self.finddoi(set([1,2,self.maxpage])) # file doi is shit..Recursively use doi in file or fail if (not fdoi and not recursive): if (len(self.doi) is 1 or len(self.doi) is 2): print "Origin fdoi wrong but has 1~2 dois in file:",self._fname, return self.recursivedoicheck(excludedoi,olddoi=fdoi,wtitle=wtitle,cutoff=cutoff,justcheck=justcheck) # No doi or >2 dois in file else: if not justcheck: self.moveresult(4,printstr="Error fdoi and 0/too much doi. 
(Fail): "+self._fname) return 4 elif (not fdoi and recursive): print "doi (in recursion) may wrong with error doi. Should never happen.." return 4 # Fail # fdoi is ok cr=CRrecord() try: cr=cr.valid_doi(fdoi,fullparse=True) except requests.exceptions.RequestException as e: print e cr=None except Exception as e: print e cr=None # Error when year=None, improve in crrecord. #if (cr and not cr.year): # cr.year='8888' #crossref is ok if (fdoi and cr): totalpagenumber=1 try: totalpagenumber=self.totalpages(cr.pages) except ValueError as e: # should never happen now print e, cr.pages totalpagewrong=False #print "pages:",self.maxpage,' in crossref:',cr.pages,totalpagenumber if totalpagenumber>0 and not (self.maxpage >= totalpagenumber and self.maxpage <= totalpagenumber+2): totalpagewrong=True # When paper with supporting information if (self.maxpage > totalpagenumber+2): self.finddoi(page=2) if (self.withSI or (self.findtext('Supporting Information', page=[totalpagenumber+1,totalpagenumber+2]) and self.findtext(cr.title, similarity=0.75, page=[totalpagenumber+1,totalpagenumber+2]))): if not recursive : self.finddoi(totalpagenumber); self.withSI=True totalpagewrong=False # For NIH Public Access elif (self.hascontent("NIH Public Access")[0]): totalpagewrong=False #Such as some Nature with SI in paper without notify. elif (self.withSI or (totalpagenumber>1 and self.findtext("acknowledgment", page=[totalpagenumber-1, totalpagenumber]) and self.findtext("reference", page=[totalpagenumber-1, totalpagenumber]))): self.withSI=True totalpagewrong=False # Recursive but total page wrong. Fast end recursivedoicheck if (totalpagewrong and recursive): return 4 # Just check first page, not find(find before..), faster: doivalid=self.checkdoi(fdoi,page=1,iterfind=False,justcheck=True) titleeval=self.checktitle(cr.title) if (totalpagenumber > 0 and not totalpagewrong): if (doivalid and titleeval[0] and len(self.doi) is 1): # Yes! Very Good PDF! 
self.realdoi=fdoi if not justcheck: if (self.maxpage>=2 and self.maxpage == totalpagenumber and not self.findtext('Supporting Information', page=[1])): self.moveresult(0,good=True) else: self.moveresult(0) return 0 # Further check doi in page2/last, Finally, will check 1,2 and last pages. if (recursive): doivalid= ( self.checkdoi(fdoi,page=2,iterfind=True,justcheck=True) or doivalid ) else: doivalid= ( self.checkdoi(fdoi,page=2,iterfind=True) or doivalid ) if len(self.doi)>3: # Too much doi may be some abstract self.moveresult(2,printstr='Has more than 3 dois! (Unsure):'+self._fname) return 2 # Page wrong and try recursive use doi if (totalpagewrong): if (len(self.doi) is 1 or len(self.doi) is 2): doi=DOI(list(self.doi)[0]) # DOI in file is same so error. Don't need recursive if (len(self.doi) is 1 and doi == fdoi): if not justcheck: self.moveresult(4,printstr="PDF Page "+str(self.maxpage)+"!="+str(totalpagenumber)+"(Fail): "+self._fname) return 4 print 'Wrong total page with dois in file,',self._fname,fdoi,',try recursive' return self.recursivedoicheck(excludedoi,olddoi=fdoi,wtitle=wtitle,cutoff=cutoff,justcheck=justcheck) else: if not justcheck: self.moveresult(4,printstr="PDF Page "+str(self.maxpage)+"!="+str(totalpagenumber)+"(Fail): "+self._fname) return 4 if (not totalpagewrong): crscore=self.scorefitting(cr) if (self.maxpage <= totalpagenumber+2): # Maybe check when maxpage >total+2 titleeval=self.checktitle(cr.title) if cr.title.strip()=="": titleeval=(False,0.9) titlevalid=titleeval[0] try: paperyear=int(cr.year) except: paperyear=9999 try: # Too old maybe lost information if (paperyear>1990): titlevalid=titlevalid or (titleeval[1]*wtitle+crscore['total'])>=cutoff else: titlevalid=titlevalid or (titleeval[1]*wtitle+crscore['total'])>=cutoff-0.1 #(self.checktitle(cr.title,similarity=0.85) and self.checkcrossref(cr)) except Exception as e: print e if (doivalid): if (titlevalid): # Yes! Good PDF! 
self.realdoi=fdoi if not justcheck: if (self.maxpage>=2 and self.maxpage == totalpagenumber and len(self.doi) is 1 and not self.findtext('Supporting Information', page=[1])): self.moveresult(0,good=True) else: self.moveresult(0) return 0 print "Title/Paper score:",titleeval[1],crscore,self._fname if (len(self.doi - set([fdoi])) == 1 and not recursive): # Try one more newresult = self.recursivedoicheck(excludedoi,olddoi=fdoi,wtitle=wtitle,cutoff=cutoff,justcheck=True) if (newresult is 0): newdoi=DOI(list(self.doi - set([fdoi]))[0]) self.realdoi=newdoi print if not justcheck: self.moveresult(0, printstr="(Rename)fdoi ok, but not title. In file doi "+newdoi+" is better for "+self._fname, newfname=newdoi.quote()+".pdf") return 0 # Else DOI ok but not title if not justcheck: self.moveresult(3,printstr="OK fdoi but not title(Untitle): "+self._fname) return 3 # Indeed, doi maybe in pdf, but strange format.. if (self.checkdoinormaltxt(fdoi)): if (titlevalid): # Further check only when title OK if (self.checkdoifurther(fdoi)): # Fine! move to Done dir if not justcheck: if (self.maxpage>=2 and self.maxpage == totalpagenumber and len(self.doi) is 1 and not self.findtext('Supporting Information', page=[1])): self.moveresult(0,good=True) else: self.moveresult(0) return 0 else: # Can't find, but high similar! move to High dir if not justcheck: self.moveresult(1,printstr="OK title and nospacebreak doi,but not pass(High): "+self._fname) return 1 else: # DOI ok but not title print "Title/Paper score:",titleeval[1],crscore,self._fname if not justcheck: self.moveresult(3,printstr="Maybe OK fdoi but not title(Untitle): "+self._fname) return 3 # DOI maybe not exist .... if (titlevalid): tmpdois=set(self.doi) for d in tmpdois: dd=DOI(d) if ( not dd.valid_doiorg(geturl=False) ): self.doi.remove(d) # Old paper don't have doi... 
if len(self.doi) is 0 and totalpagenumber>0: if (crscore['total'] >= 0.4): if not justcheck: if (self.maxpage>=2 and self.maxpage == totalpagenumber and not self.findtext('Supporting Information', page=[1])): self.moveresult(0,good=True) else: self.moveresult(0) return 0 elif (titleeval[1]>=0.85 and crscore['total'] >= 0.35): if not justcheck: if (self.maxpage>=2 and self.maxpage == totalpagenumber and not self.findtext('Supporting Information', page=[1])): self.moveresult(0,good=True) else: self.moveresult(0) return 0 elif (titleeval[1]>=0.95 and crscore['total'] >=0.3): if not justcheck: if (self.maxpage>=2 and self.maxpage == totalpagenumber and not self.findtext('Supporting Information', page=[1])): self.moveresult(0,good=True) else: self.moveresult(0) return 0 elif (titleeval[1]>=0.90 and crscore['pages']>=0.9 and crscore['year']>=0.9 and (crscore['journal']>=0.9 or crscore['issn']>=0.9)): if not justcheck: self.moveresult(0) return 0 elif (titleeval[1]>=0.90 and crscore['pages']>=0.5 and crscore['year']>=0.9 and (crscore['journal']>=0.9 or crscore['issn']>=0.9) and crscore['authors']>=0.7): if not justcheck: self.moveresult(0) return 0 elif (titleeval[1]>=0.75 or crscore['total'] >=0.25): print "Title/Paper score:",titleeval[1],crscore,self._fname if not justcheck: self.moveresult(1,printstr="OK title and high info fit. But no doi(Highly): "+self._fname) return 1 else: print "Title/Paper score:",titleeval[1],crscore,self._fname if not justcheck: self.moveresult(2,printstr="OK title and ok info fit. But no doi(Unsure): "+self._fname) return 2 elif len(self.doi) is 0 and totalpagenumber== -1: if (titleeval[1]>=0.90 and crscore['pages']>=0.5 and crscore['year']>=0.9 and (crscore['journal']>=0.9 or crscore['issn']>=0.9) and crscore['authors']>=0.7): if not justcheck: self.moveresult(0) return 0 else: print "Title/Paper score:",titleeval[1],crscore,self._fname if not justcheck: self.moveresult(2,printstr="OK title and high info fit. 
But no doi and no total pages(Unsure): "+self._fname) return 2 elif len(self.doi) is 0 and totalpagenumber<=0: print "Title/Paper score:",titleeval[1],crscore,self._fname if not justcheck: self.moveresult(2,printstr="OK title and high info fit. But no doi and no total pages(Unsure): "+self._fname) return 2 elif ( len(self.doi) > 0 and not recursive): print "Good title but file doesn't contain fdoi, however it has >0 doi in file. " outnow=self.recursivedoicheck(excludedoi,olddoi=fdoi,wtitle=wtitle,cutoff=cutoff,justcheck=True) if outnow > 0: if not justcheck: self.moveresult(2,printstr="OK title but not fdoi. In file doi is not good(Unsure): "+self._fname) return 2 elif(outnow==0): print 'Good Title but Fail fdoi. Paper has good in file doi,',self._fname,',try recursive' return self.recursivedoicheck(excludedoi,olddoi=fdoi,wtitle=wtitle,cutoff=cutoff,justcheck=justcheck) ### Old method check old items: #if (self.checkcrossref(cr)): # if (int(cr.year)<=1999 and len(self.doi) is 0): # # Highly possible right # if not justcheck: self.movetodir("High") # return True # Bentham, often blank doi # elif (fdoi[:8] == '10.2174/' and len(self.doi) is 0): # if not justcheck: self.movetodir("Done") # return True # elif (len(self.doi) is 0): # print "Title/Paper score:",titleeval[1],crscore,self._fname # if not justcheck: # self.moveresult(1,printstr="OK title and high info fit. But no doi(Highly): "+self._fname) # return 1 # else: # if not justcheck: # self.moveresult(2,printstr="OK title and high info fit. 
But doi exist not fit(Unsure): "+self._fname) # return 2 #elif(len(self.doi) is 0): # # Maybe wrong file and no doi # if not justcheck: # self.moveresult(2,printstr="Not found doi in file but ok title (Unsure): "+self._fname) # return 2 #fdoi,title wrong, no doi in file # Or in recursive mode if (len(self.doi) is 0 or recursive): if not justcheck: self.moveresult(4,printstr="Both fdoi and title wrong, no doi in file(Fail): "+self._fname) return 4 # Indeed, file has only one more doi, not the same to fname if (len(self.doi - set([fdoi])) is 1 ): print 'Fail fdoi/title. Paper with one more doi in file,',self._fname,',try recursive' return self.recursivedoicheck(excludedoi,olddoi=fdoi,wtitle=wtitle,cutoff=cutoff,justcheck=justcheck) elif(len(self.doi) > 1): if not justcheck: self.moveresult(4,printstr="fdoi/title fail. Too much infile doi(Fail): "+self._fname) return 4 else: if not justcheck: self.moveresult(4,printstr="What????? What?????(Fail):"+self._fname) return 4 # not cr else: if (not recursive): self.finddoi(set([1,2,self.maxpage])) if (len(self.doi) is 1 or len(self.doi) is 2): print 'Error DOI filename,',self._fname,',try recursive' return self.recursivedoicheck(excludedoi,olddoi=fdoi,wtitle=wtitle,cutoff=cutoff,justcheck=justcheck) if not justcheck: self.moveresult(6,"Error DOI fname(Fail):"+self._fname) return 6