def finddoi(self,page=1): '''Find doi in given page number If page<=0, find all page; >0 single page. If page is list, find page in list''' if not self._fname: print "File Name Not Set!!!" return "" if (self.maxpage is 0): print 'Error max page 0 for '+self._fname return "" if (isinstance(page,(str,float)) ): page=int(page) if (isinstance(page,int)): if (page <= 0 ): outstr=self.pdfcontextpreprocess(self.handle.GetAllPages(self._fname,fobj=self.fobj)) self.doi.update(self.doiresultprocess(self.pdoi.findall(outstr))) self.normaltxt=normalizeString(outstr).lower().strip().replace(' ','') self.didpage.update(range(1,self.maxpage+1)) # Only valid page if (page>self.maxpage): page=self.maxpage # Only page not process before if (page not in self.didpage): outstr=self.pdfcontextpreprocess(self.handle.GetSinglePage(self._fname,page,fobj=self.fobj)) self.doi.update(self.doiresultprocess(self.pdoi.findall(outstr))) self.normaltxt[page-1]=normalizeString(outstr).lower().strip().replace(' ','') self.didpage.add(page) elif ( isinstance(page,(list,tuple,set))): for i in page: self.finddoi(i)
def finddoi(self, num, prefix='', issn=''): title = self.gettitle(num) doi = DOI(self.getdoi(num)) if (not prefix): prefix = doi.split('/', 1)[0] if doi else "" volume = self.getvolume(num) journal = self.getjournalfull(num) year = self.getyear(num) pages = self.getpages(num) self.cr = CRrecord() try: # The origin doi maybe true. Find in crossref if (doi and self.cr.getfromdoi(doi, fullparse=False) and self.cr.doi): # Further check title if (strdiff(doi,self.cr.doi)>=0.85 and \ strsimilarity(normalizeString(title),normalizeString(self.cr.title))>0.75): return doi if (volume and pages): ops = pages.split('-') crps = self.cr.pages.split('-') if (len(ops) > 0 and len(crps) > 0 and ops[0] == crps[0] and volume == self.cr.volume): return doi if (year and pages): ops = pages.split('-') crps = self.cr.pages.split('-') if (len(ops) > 0 and len(crps) > 0 and ops[0] == crps[0] and year == self.cr.year): return doi print "Origin DOI:", doi, "may be true but record strange..Try title" keyword = title + " " + journal + " " + year + " " + pages + " " + volume if (self.cr.getfromtitledoi(keyword, doi, year=year, limit=10, fullparse=False, prefix=prefix)): if (doi): if (prefix == self.cr.doi.split('/')[0] and strdiff(doi, self.cr.doi) >= 0.85): return self.cr.doi else: print "Error for origin doi: " + doi + "; found: " + self.cr.doi return "" return self.cr.doi if (doi): if (strdiff(doi, self.cr.doi) >= 0.85): return self.cr.doi else: print "Error2 for origin doi: " + doi + "; found: " + self.cr.doi return "" else: return "" except Exception as e: print "Error when find doi..", e, "\nRetry..." return self.finddoi(num, prefix=prefix, issn=issn)
def finddoi(self,num,prefix='',issn=''): title=self.gettitle(num) doi=DOI(self.getdoi(num)) if (not prefix): prefix = doi.split('/',1)[0] if doi else "" volume= self.getvolume(num) journal=self.getjournalfull(num) year=self.getyear(num) pages=self.getpages(num) self.cr=CRrecord() try: # The origin doi maybe true. Find in crossref if ( doi and self.cr.getfromdoi(doi,fullparse=False) and self.cr.doi): # Further check title if (strdiff(doi,self.cr.doi)>=0.85 and \ strsimilarity(normalizeString(title),normalizeString(self.cr.title))>0.75): return doi if( volume and pages ): ops=pages.split('-') crps=self.cr.pages.split('-') if (len(ops)>0 and len(crps)>0 and ops[0]==crps[0] and volume==self.cr.volume): return doi if( year and pages ): ops=pages.split('-') crps=self.cr.pages.split('-') if (len(ops)>0 and len(crps)>0 and ops[0]==crps[0] and year==self.cr.year): return doi print "Origin DOI:",doi,"may be true but record strange..Try title" keyword=title+" "+journal+" "+year+" "+pages+" "+volume if (self.cr.getfromtitledoi(keyword,doi,year=year,limit=10,fullparse=False,prefix=prefix)): if (doi): if( prefix == self.cr.doi.split('/')[0] and strdiff(doi,self.cr.doi)>=0.85): return self.cr.doi else: print "Error for origin doi: "+doi+"; found: "+self.cr.doi return "" return self.cr.doi if (doi): if( strdiff(doi,self.cr.doi)>=0.85): return self.cr.doi else: print "Error2 for origin doi: "+doi+"; found: "+self.cr.doi return "" else: return "" except Exception as e: print "Error when find doi..",e,"\nRetry..." return self.finddoi(num,prefix=prefix,issn=issn)
def findtext(self,text,similarity=0.95, page=1): '''Just Find text in Page, Don't search doi and save it''' if not self._fname: print "File Name Not Set!!!" return "" if (self.maxpage is 0): print 'Error max page 0 for '+self._fname return "" if (isinstance(page,(str,float)) ): page=int(page) normaltxt="" if (isinstance(page,int)): if (page<=0): page=1 # Only valid page if (page>self.maxpage): page=self.maxpage # Only page not process before if (not self.normaltxt[page-1] ): outstr=self.pdfcontextpreprocess(self.handle.GetSinglePage(self._fname,page,fobj=self.fobj)) self.normaltxt[page-1]=normalizeString(outstr).lower().strip().replace(' ','') return self.hascontent(text, similarity=similarity, page=page)[0] elif ( isinstance(page,(list,tuple,set))): outyn=False for i in page: outyn= self.findtext(text,similarity=similarity,page=i) if (outyn): break return outyn
def tryrenamefromtitle(self,fname=None,cutoff=0.85,fontsize=0,autotry=False,wtitle=0.65): if not fname: fname=self._fname if (not fname or (fname == "None")): print "No file name is set!" return 0 outstr=self.getbigtitle(fname=fname,cutoff=cutoff,fontsize=fontsize,autotry=autotry).lower().strip() print outstr url="http://api.crossref.org/works?query="+normalizeString(outstr)+"&rows=5" r=requests.get(url,timeout=TIMEOUT_SETTING) dois=[] if (r.status_code is 200): datas=r.json().get('message',{'items':[]}).get('items',[]) for data in datas: dois.append(data.get('DOI','')) self.reset(fname) outnow=99999 for doi in dois: print "Try doi:",doi,'for',fname self.doi=set([doi]) out=self.renamecheck(fname=fname, fobj=None ,wtitle=wtitle,cutoff=cutoff, justcheck=False,fdoi=None,resetfile=False,excludedoi=None) if out ==0: break elif outnow>out: outnow=out
def getbigtitle(self,fname=None,cutoff=0.85,fontsize=0,autotry=False): '''Get the title or big font context''' if not fname: fname=self._fname if (not fname): print "No file name is set!" return "" s=self.handle.GetPages(fname,pagenos=[1,2,3],html=True,fobj=self.fobj) self.handle.reset(html=True) result="" if autotry: for i in range(19): cutoffnow=1.0-0.05*(i+1) result=normalizeString(fontsizestr(s,cutoff=cutoffnow)) if (len(result)> 10): break else: result=normalizeString(fontsizestr(s,cutoff=cutoff,fontsize=fontsize)) return result
def hascontent(self,text, similarity=0.95,page=None,algorithm=2): '''Normalize text and find it in normalized pdf content found before. Normal use algorithm 2, for title use algorithm 3''' if not self._fname: print "File Name Not Set!!!" return (False,0.0) text=normalizeString(text).lower().strip().replace(' ','') if (not text): return (False,0.0) if (len(text)<3): return (False,0.0) try: #Check all parse before if (not page or (isinstance(page,int) and (page>self.maxpage or page<=0))): if (len(text)==3): perfect=text in ''.join(self.normaltxt) return (perfect,float(perfect)/2) if (similarity<1.0): #print text,''.join(self.normaltxt) sim=strsimilarity(''.join(self.normaltxt),text,algorithm=algorithm) return (sim >= similarity,sim) else: perfect=text in ''.join(self.normaltxt) return (perfect,float(perfect)) elif (isinstance(page,int)): if (len(text)==3): perfect=text in self.normaltxt[page-1] return (perfect,float(perfect)/2) if (similarity<1.0): #print text,self.normaltxt[page-1] sim=strsimilarity(self.normaltxt[page-1],text,algorithm=algorithm) return (sim >= similarity,sim) else: perfect=text in self.normaltxt[page-1] return (perfect,float(perfect)) except: print "Something error for hascontent function: "+text return (False,0.0)
def getfromtitledoi( self, title, doi, year="", volume="", issue="", pages="", limit=3, offset=0, cutoff=0.1, fullparse=True, ignorecheminfo=True, prefix="", issn="", ): """Get information from journal title and doi, better with year, volume, issue, pages information""" # Over max records try if offset > limit: return False # Cancel ISSN check because unreliable # search url if issn and len(issn.strip()) is 9: url = ( "http://api.crossref.org/journals/" + issn + "/works?query=" + normalizeString(title) + "&rows=1&offset=" + str(offset) ) elif prefix: url = ( "http://api.crossref.org/prefixes/" + prefix + "/works?query=" + normalizeString(title) + "&rows=1&offset=" + str(offset) ) else: url = "http://api.crossref.org/works?query=" + normalizeString(title) + "&rows=1&offset=" + str(offset) if year: # some time year maybe +- 1 url += "&filter=from-pub-date:" + str(int(year) - 1) + "-06,until-pub-date:" + str(int(year) + 1) + "-06" # print url # search crossref r = requests.get(url, timeout=timeout_setting) if r.status_code is 200: try: for currentrecord in range(len(r.json()["message"]["items"])): data = r.json()["message"]["items"][currentrecord] # should better then cutoff if float(data["score"]) > cutoff: self.title = data.get("title", [""])[0] self.year = str(data["issued"]["date-parts"][0][0]) self.volume = data.get("volume", "") self.issue = data.get("issue", "") self.pages = data.get("page", "") self.doi = data.get("DOI", "") if fullparse: self.journals = data.get("container-title", [""]) self.issns = data.get("ISSN", [""]) if len(self.journals) >= 1: self.journal = self.journals[0] else: self.journal = "" if len(self.issns) >= 1: self.issn = self.issns[0] else: self.issn = "" self.authors = self._getauthor(data.get("author", [])) self.urls = [data.get("URL", "")] if doi.strip(): if strdiff(doi.strip(), self.doi) >= 0.85: return True # else blank # check whether fitting to giving parameters if year and year.strip() != self.year.strip(): # possible +- 1year if 
not (abs(int(year) - int(self.year)) is 1 and volume.strip() == self.volume.strip()): continue if volume and volume.strip() != self.volume.strip(): continue if pages and pages.strip().split("-")[0] != self.pages.strip().split("-")[0]: continue if ignorecheminfo and data.get("container-title", [""])[0].lower() == "cheminform": continue return True # Low score, more try. else: continue return False except: print "Something error for finding " + title.encode("utf-8") return False else: print "Journal title can't be found: " + title.encode("utf-8") return False
def checkdoinormaltxt(self, doi):
    '''Check whether the no-space normalized doi occurs in the
    no-space normalized context parsed so far.'''
    needle = normalizeString(doi).lower().replace(" ", '')
    haystack = ''.join(self.normaltxt)
    return needle in haystack
def getfromtitledoi(self,title,doi, year="",volume="",issue="",pages="", \ limit=3, offset=0, cutoff=0.1, fullparse=True,ignorecheminfo=True,prefix="",issn=""): '''Get information from journal title and doi, better with year, volume, issue, pages information''' # Over max records try if (offset > limit): return False # Cancel ISSN check because unreliable # search url if (issn and len(issn.strip()) is 9): url = "http://api.crossref.org/journals/" + issn + "/works?query=" + normalizeString( title) + "&rows=1&offset=" + str(offset) elif (prefix): url = "http://api.crossref.org/prefixes/" + prefix + "/works?query=" + normalizeString( title) + "&rows=1&offset=" + str(offset) else: url = "http://api.crossref.org/works?query=" + normalizeString( title) + "&rows=1&offset=" + str(offset) if (year): #some time year maybe +- 1 url += "&filter=from-pub-date:" + str(int( year) - 1) + "-06,until-pub-date:" + str(int(year) + 1) + "-06" #print url # search crossref r = requests.get(url, timeout=timeout_setting) if (r.status_code is 200): try: for currentrecord in range(len(r.json()['message']['items'])): data = r.json()['message']['items'][currentrecord] # should better then cutoff if (float(data['score']) > cutoff): self.title = data.get('title', [''])[0] self.year = str(data['issued']['date-parts'][0][0]) self.volume = data.get('volume', '') self.issue = data.get('issue', '') self.pages = data.get('page', '') self.doi = data.get('DOI', '') if (fullparse): self.journals = data.get('container-title', ['']) self.issns = data.get('ISSN', ['']) if (len(self.journals) >= 1): self.journal = self.journals[0] else: self.journal = "" if (len(self.issns) >= 1): self.issn = self.issns[0] else: self.issn = "" self.authors = self._getauthor( data.get('author', [])) self.urls = [data.get('URL', '')] if (doi.strip()): if (strdiff(doi.strip(), self.doi) >= 0.85): return True #else blank # check whether fitting to giving parameters if (year and year.strip() != self.year.strip()): # possible +- 
1year if not (abs(int(year) - int(self.year)) is 1 and volume.strip() == self.volume.strip()): continue if (volume and volume.strip() != self.volume.strip()): continue if (pages and pages.strip().split('-')[0] != self.pages.strip().split('-')[0]): continue if (ignorecheminfo and data.get('container-title', [''])[0].lower() == "cheminform"): continue return True # Low score, more try. else: continue return False except: print "Something error for finding " + title.encode('utf-8') return False else: print "Journal title can't be found: " + title.encode('utf-8') return False