예제 #1
0
	def grepBingAcadPDFbyID(self,bid,maxpage=1,printyn=True):
		'''Grep at most maxpage pages pdf for given bing id
		Save to doi style based on refering to crossref.'''
		if (printyn):
			print "###  ###  ###  ###  ###  ###  ###  ###  ### "
			print "## Finding for "+bid+"...."
		cr=CRrecord()
		ref=self.bidref(bid)
		if (printyn):
			print ref
		if (os.path.exists(bid+".pdf")):
			print "Exist file:"+bid+".pdf"
			return
		if ref['title']:
			if (cr.getfromtitle(title=ref['title'],year=ref['year'],volume=ref['volume'],
					pages=ref['pages'],issue=ref['issue'],fullparse=False) and cr.doi):
				# try to find by title, if found (true):
				if (printyn): print cr
				outname=quotefileDOI(cr.doi)+".pdf"
				if (not os.path.exists(outname)):
					if (self.getbidpdf(bid,filename=outname,printyn=printyn)):
						print "Have Found PDF file: "+outname
				else:
					print "Exist file:"+outname
			else:
				if (self.getbidpdf(bid,filename=bid+".pdf",printyn=printyn)):
					print "Have Found PDF file: "+bid+".pdf"
		else:
			if (self.getbidpdf(bid,filename=bid+".pdf",printyn=printyn)):
				print "Have Found PDF file: "+bid+".pdf"
예제 #2
0
    def finddoi(self, num, prefix='', issn=''):
        title = self.gettitle(num)
        doi = DOI(self.getdoi(num))
        if (not prefix):
            prefix = doi.split('/', 1)[0] if doi else ""
        volume = self.getvolume(num)
        journal = self.getjournalfull(num)
        year = self.getyear(num)
        pages = self.getpages(num)
        self.cr = CRrecord()
        try:
            # The origin doi maybe true. Find in crossref
            if (doi and self.cr.getfromdoi(doi, fullparse=False)
                    and self.cr.doi):
                # Further check title
                if (strdiff(doi,self.cr.doi)>=0.85 and \
                strsimilarity(normalizeString(title),normalizeString(self.cr.title))>0.75):
                    return doi
                if (volume and pages):
                    ops = pages.split('-')
                    crps = self.cr.pages.split('-')
                    if (len(ops) > 0 and len(crps) > 0 and ops[0] == crps[0]
                            and volume == self.cr.volume):
                        return doi
                if (year and pages):
                    ops = pages.split('-')
                    crps = self.cr.pages.split('-')
                    if (len(ops) > 0 and len(crps) > 0 and ops[0] == crps[0]
                            and year == self.cr.year):
                        return doi
                print "Origin DOI:", doi, "may be true but record strange..Try title"

            keyword = title + " " + journal + " " + year + " " + pages + " " + volume
            if (self.cr.getfromtitledoi(keyword,
                                        doi,
                                        year=year,
                                        limit=10,
                                        fullparse=False,
                                        prefix=prefix)):
                if (doi):
                    if (prefix == self.cr.doi.split('/')[0]
                            and strdiff(doi, self.cr.doi) >= 0.85):
                        return self.cr.doi
                    else:
                        print "Error for origin doi: " + doi + "; found: " + self.cr.doi
                        return ""
                return self.cr.doi
            if (doi):
                if (strdiff(doi, self.cr.doi) >= 0.85):
                    return self.cr.doi
                else:
                    print "Error2 for origin doi: " + doi + "; found: " + self.cr.doi
                    return ""
            else:
                return ""
        except Exception as e:
            print "Error when find doi..", e, "\nRetry..."
            return self.finddoi(num, prefix=prefix, issn=issn)
예제 #3
0
	def findcrossreftitledoi(self,doi,printyn=True):
		'''Find doi by crossref first'''
		cr=CRrecord()
		if( cr.getfromdoi(doi,fullparse=False) and cr.doi):
			keyword=(cr.title+" "+cr.doi).encode('utf-8')
			print "#########################################################################"
			print "## Now finding for doi with title: "+ keyword.encode('utf-8')+"............"
			sys.stdout.flush()
			self.grepBingAcadPDF(keyword=keyword,maxpage=1,printyn=printyn)
		else:
			print "Error DOI!: "+doi
		cr.reset()
예제 #4
0
	def getdoi(self,num=0):
		'''Get DOI from Baidu Cite'''
		soup=BeautifulSoup(self.getcite(num,citetype='txt'),"html.parser")
		if (soup.doi): 
			doi=soup.doi.text
		elif(soup.primarytitle):
			cr=CRrecord()
			cr.getfromtitle(soup.primarytitle.info.text,ignorecheminfo=True)
			doi=cr.doi
		else:
			doi=DOI("")
		return DOI(doi[doi.find('10.'):])
예제 #5
0
	def findcrossreftitledoi(self,doi,printyn=True):
		'''Find doi by crossref first'''
		cr=CRrecord()
		if( cr.getfromdoi(doi,fullparse=False) and cr.doi):
			keyword=cr.title+" "+cr.doi
			print "#########################################################################"
			print "## Now finding for doi with title: "+ keyword.encode('utf-8')+"............"
			sys.stdout.flush()
			self.search(keyword=keyword)
			self.getallpdf()
		else:
			print "Error DOI!: "+doi
		cr.reset()
예제 #6
0
	def finddoi(self,num,prefix='',issn=''):
		title=self.gettitle(num)
		doi=DOI(self.getdoi(num))
		if (not prefix):
			prefix = doi.split('/',1)[0] if doi else ""
		volume= self.getvolume(num)
		journal=self.getjournalfull(num)
		year=self.getyear(num) 
		pages=self.getpages(num)
		self.cr=CRrecord()
		try:
			# The origin doi maybe true. Find in crossref
			if ( doi and self.cr.getfromdoi(doi,fullparse=False) and self.cr.doi):
				# Further check title
				if (strdiff(doi,self.cr.doi)>=0.85 and \
				strsimilarity(normalizeString(title),normalizeString(self.cr.title))>0.75):
					return doi
				if( volume and pages ):
					ops=pages.split('-')
					crps=self.cr.pages.split('-')
					if (len(ops)>0 and len(crps)>0 and ops[0]==crps[0] and volume==self.cr.volume):
						return doi
				if( year and pages ):
					ops=pages.split('-')
					crps=self.cr.pages.split('-')
					if (len(ops)>0 and len(crps)>0 and ops[0]==crps[0] and year==self.cr.year):
						return doi
				print "Origin DOI:",doi,"may be true but record strange..Try title"

			keyword=title+" "+journal+" "+year+" "+pages+" "+volume
			if (self.cr.getfromtitledoi(keyword,doi,year=year,limit=10,fullparse=False,prefix=prefix)):
				if (doi):
					if( prefix == self.cr.doi.split('/')[0] and strdiff(doi,self.cr.doi)>=0.85):
						return self.cr.doi
					else:
						print "Error for origin doi: "+doi+"; found: "+self.cr.doi
						return ""
				return self.cr.doi
			if (doi):
				if( strdiff(doi,self.cr.doi)>=0.85):
					return self.cr.doi
				else:
					print "Error2 for origin doi: "+doi+"; found: "+self.cr.doi
					return ""
			else:
				return ""
		except Exception as e:
			print "Error when find doi..",e,"\nRetry..."
			return self.finddoi(num,prefix=prefix,issn=issn)
예제 #7
0
	def findPDFbyISSN(self,issn,maxresult=None, step=100, offset=0):
		'''Find PDF by ISSN based on search result from crossref'''
		# may be improve to not only issn..
		if (not issn):return
		needurl="http://api.crossref.org/journals/"+issn+"/works"
		cr=CRrecord()
		total=cr.gettotalresultfromlink(needurl)
		if (not maxresult or maxresult <=0 or maxresult>total): 
			maxresult=total
		params={"rows":str(step)}
		maxround=(maxresult-offset)/step+1
		offsetcount=offset
		for i in range(maxround):
			params["offset"]=str(step*i+offset)
			r=requests.get(needurl,params,timeout=timeout_setting_download)
			if (r.status_code is 200):
				for j in r.json()['message']['items']:
					keyword=j.get('title',[''])[0]+" "+j.get("DOI","")
					print "#####################################",offsetcount,"####################################"
					print "## Now finding for doi with title: "+ keyword.encode('utf-8')+"............"
					sys.stdout.flush()
					bingacad.grepBingAcadPDF(keyword.encode('utf-8'))
					offsetcount+=1
			gc.collect()
예제 #8
0
class EndnoteXML(object):
    def __init__(self, fname):
        if (fname):
            f = open(fname)
            self.content = re.sub(r'</?style.*?>', '', f.read())
            f.close()
        else:
            self.content = ""
        self.soup = BeautifulSoup(self.content, 'html.parser')
        self.records = self.soup.records.contents
        self.length = len(self.records)

        for i in range(self.length):
            self.checktag(i, 'titles')
            self.checktag(i, 'authors')
            self.checktag(i, 'urls')
            if (self.records[i].find('related-urls') is None):
                self.addtag(i, 'related-urls', '', parent='urls')
            if (self.records[i].find('pdf-urls') is None):
                self.addtag(i, 'pdf-urls', '', parent='urls')
            self.checktag(i, 'dates')
            self.setdoi(i, self.getdoi(i))

    #def __repr__(self):
    #	return self.soup.encode()

    def __str__(self):
        return self.soup.encode()

    def reset(self, fname):
        self.__init__(fname)

    def read(self, fname):
        self.__init__(fname)

    def reads(self, s):
        self.content = s
        self.soup = BeautifulSoup(self.content, 'html.parser')
        self.records = self.soup.records.contents
        self.length = len(self.records)
        for i in range(self.length):
            self.checktag(i, 'titles')
            self.checktag(i, 'authors')
            self.checktag(i, 'urls')
            if (self.records[i].find('related-urls') is None):
                self.addtag(i, 'related-urls', '', parent='urls')
            if (self.records[i].find('pdf-urls') is None):
                self.addtag(i, 'pdf-urls', '', parent='urls')
            self.checktag(i, 'dates')
            self.setdoi(i, self.getdoi(i))

    def writes(self, encoding='utf-8'):
        return self.soup.encode(encoding=encoding)

    def write(self, fname, encoding='utf-8'):
        f = open(fname, 'w')
        f.write(self.writes(encoding=encoding))
        f.close()

    def getrecord(self, num):
        if (num >= self.length):
            return None
        return self.records[num]

    def checktag(self, num, tag):
        if self.records[num].find(tag) is None:
            self.addtag(num, tag, value='')

    def addtag(self, num, tag, value=None, parent=None):
        '''value can be string, tag'''
        a = self.soup.new_tag(tag)
        if value: a.string = value
        if parent:
            self.records[num].find(parent).append(a)
        else:
            self.records[num].append(a)

    def gettag(self, num, tag, parent=None, obj=False):
        if parent:
            if self.records[num].find(parent):
                if self.records[num].find(parent).find(tag):
                    if (obj):
                        return self.records[num].find(parent).find(tag)
                    else:
                        return self.records[num].find(parent).find(tag).string
                else:
                    return ''
            else:
                return ''
        else:
            if self.records[num].find(tag):
                if (obj):
                    return self.records[num].find(tag)
                else:
                    return self.records[num].find(tag).string
            else:
                return ''

    def settag(self, num, tag, value, parent=None):
        if parent:
            if self.records[num].find(parent):
                if self.records[num].find(parent).find(tag):
                    self.records[num].find(parent).find(tag).string = value
                else:
                    self.addtag(num, tag, parent=parent, value=value)
            else:
                a = self.soup.new_tag(tag)
                a.string = value
                self.addtag(num, parent, parent=None, value=a)
        else:
            if self.records[num].find(tag):
                self.records[num].find(tag).string = value
            else:
                self.addtag(num, tag, parent=None, value=value)

    def getpath(self):
        db = self.soup.findChild("database")
        if (db):
            return os.path.splitext(db['path'])[0] + '.Data'
        else:
            return ""

    def getdoi(self, num):
        doistr = self.gettag(num, "electronic-resource-num")
        if (doistr):
            doiindex = doistr.find('10.')
        else:
            doiindex = -1
        if (doiindex >= 0):
            return doistr[doiindex:].lower().strip()
        else:
            return ""

    def setdoi(self, num, value):
        self.settag(num, "electronic-resource-num", value)

    def gettitle(self, num):
        return self.gettag(num, "title")

    def settitle(self, num, value):
        self.settag(num, "title", value)

    def getjournalfull(self, num):
        return self.gettag(num, 'secondary-title')

    def getyear(self, num):
        return self.gettag(num, 'year', 'dates')

    def setyear(self, num, value):
        self.settag(num, 'year', value, 'dates')

    def getvolume(self, num):
        return self.gettag(num, 'volume')

    def setvolume(self, num, value):
        self.settag(num, 'volume', value)

    def getissue(self, num):
        return self.gettag(num, 'number')

    def setissue(self, num, value):
        self.settag(num, 'number', value)

    def getpages(self, num):
        return self.gettag(num, 'pages')

    def setpages(self, num, value):
        self.settag(num, 'pages', value)

    def getnotes(self, num):
        return self.gettag(num, 'notes')

    def setnotes(self, num, value):
        self.settag(num, 'notes', value)

    def geturl(self, num):
        urls = self.gettag(num, 'related-urls', obj=True)
        if (urls):
            return [i.string for i in urls.find_all('url')]
        else:
            return []

    def seturl(self, num, value):
        '''Note that it will clean all the url!'''
        if (self.soup.find('related-urls') is not None):
            urls = self.gettag(num, 'related-urls', obj=True)
            if (urls):
                urls.clear()
        else:
            self.addtag(num, 'related-urls', parent='urls')
        self.addtag(num, 'url', value, 'related-urls')

    def addurl(self, num, value, first=False):
        urls = self.gettag(num, 'related-urls', obj=True)
        a = self.soup.new_tag('url')
        a.string = value
        if (urls):
            if (not first):
                urls.append(a)
            else:
                urls.insert(0, a)
        else:
            self.settag(num, 'related-urls', a, 'urls')

    def getpdf(self, num):
        urls = self.gettag(num, 'pdf-urls', obj=True)
        if (urls):
            return [i.string for i in urls.find_all('url')]
        else:
            return []

    def setpdf(self, num, value):
        '''Note that it will clean all the url!'''
        if (self.soup.find('pdf-urls') is not None):
            urls = self.gettag(num, 'pdf-urls', obj=True)
            if (urls):
                urls.clear()
        else:
            self.addtag(num, 'pdf-urls', parent='urls')
        self.addtag(num, 'url', value, 'pdf-urls')

    def setpdfs(self, num, value):
        '''Note that it will clean all the url!'''
        if (self.soup.find('pdf-urls') is not None):
            urls = self.gettag(num, 'pdf-urls', obj=True)
            if (urls):
                urls.clear()
        else:
            self.addtag(num, 'pdf-urls', parent='urls')
        for url in value:
            self.addtag(num, 'url', url, 'pdf-urls')

    def addpdf(self, num, value, first=False):
        urls = self.gettag(num, 'pdf-urls', obj=True)
        a = self.soup.new_tag('url')
        a.string = value
        if (urls):
            if (not first):
                urls.append(a)
            else:
                urls.insert(0, a)
        else:
            self.addtag(num, 'pdf-urls', a, 'urls')

    def finddoi(self, num, prefix='', issn=''):
        title = self.gettitle(num)
        doi = DOI(self.getdoi(num))
        if (not prefix):
            prefix = doi.split('/', 1)[0] if doi else ""
        volume = self.getvolume(num)
        journal = self.getjournalfull(num)
        year = self.getyear(num)
        pages = self.getpages(num)
        self.cr = CRrecord()
        try:
            # The origin doi maybe true. Find in crossref
            if (doi and self.cr.getfromdoi(doi, fullparse=False)
                    and self.cr.doi):
                # Further check title
                if (strdiff(doi,self.cr.doi)>=0.85 and \
                strsimilarity(normalizeString(title),normalizeString(self.cr.title))>0.75):
                    return doi
                if (volume and pages):
                    ops = pages.split('-')
                    crps = self.cr.pages.split('-')
                    if (len(ops) > 0 and len(crps) > 0 and ops[0] == crps[0]
                            and volume == self.cr.volume):
                        return doi
                if (year and pages):
                    ops = pages.split('-')
                    crps = self.cr.pages.split('-')
                    if (len(ops) > 0 and len(crps) > 0 and ops[0] == crps[0]
                            and year == self.cr.year):
                        return doi
                print "Origin DOI:", doi, "may be true but record strange..Try title"

            keyword = title + " " + journal + " " + year + " " + pages + " " + volume
            if (self.cr.getfromtitledoi(keyword,
                                        doi,
                                        year=year,
                                        limit=10,
                                        fullparse=False,
                                        prefix=prefix)):
                if (doi):
                    if (prefix == self.cr.doi.split('/')[0]
                            and strdiff(doi, self.cr.doi) >= 0.85):
                        return self.cr.doi
                    else:
                        print "Error for origin doi: " + doi + "; found: " + self.cr.doi
                        return ""
                return self.cr.doi
            if (doi):
                if (strdiff(doi, self.cr.doi) >= 0.85):
                    return self.cr.doi
                else:
                    print "Error2 for origin doi: " + doi + "; found: " + self.cr.doi
                    return ""
            else:
                return ""
        except Exception as e:
            print "Error when find doi..", e, "\nRetry..."
            return self.finddoi(num, prefix=prefix, issn=issn)

    def preprocess(self):
        pass

    def cleannote(self, num):
        note = self.getnotes(num)
        notel = note.lower()
        if ("time" in notel):
            self.setnotes(num, notel[notel.find('time'):])

    def cleanallpdf(self, exceptOAPDF=True):
        '''Clean PDF record or except OAPDF record'''
        for i in range(self.length):
            if (not exceptOAPDF):
                self.setpdf(i, '')
            else:
                for pdf in self.getpdf(i):
                    if "internal-pdf://OAPDF/" in pdf:
                        self.setpdf(i, pdf)
                        break

    def process(self, fname="", cleannote=False, prefix='', issn='', start=0):
        epath = self.getpath()
        print "Output", self.length, "to", epath + os.sep + fname
        for i in range(start, self.length):
            try:
                #if (i%100 is 0):
                #	print
                #	print "Doing:",i+1,
                #else:
                #	print i+1,

                pdfs = self.getpdf(i)
                urls = self.geturl(i)
                # Fast consider as record process before
                hasfound = False
                for pdf in pdfs:
                    if "internal-pdf://OAPDF/" in pdf:
                        hasfound = True
                        doistr = self.gettag(i, "electronic-resource-num")
                        if (doistr and len(doistr) > 4
                                and doistr[:4] == 'chk:'):
                            doi = DOI(self.getdoi(i))
                            if doi:
                                self.setdoi(i, "chk: " + doi)
                        break

                if not hasfound:
                    for url in urls:
                        if "http://oapdf.sourceforge.net/cgi-bin/" in url:
                            hasfound = True
                            doistr = self.gettag(i, "electronic-resource-num")
                            if (doistr and len(doistr) > 4
                                    and doistr[:4] == 'chk:'):
                                doi = DOI(self.getdoi(i))
                                if doi:
                                    self.setdoi(i, "chk: " + doi)
                            break
                if hasfound:
                    continue

                if (cleannote):
                    self.cleannote(i)

                doistr = self.gettag(i, "electronic-resource-num")
                if (doistr and len(doistr) > 4 and doistr[:4] == 'chk:'):
                    doi = DOI(self.getdoi(i))
                else:
                    doi = DOI(self.finddoi(i, prefix=prefix, issn=issn))
                    if doi:
                        self.setdoi(i, "chk: " + doi)
                oapdflink = ""
                if (doi and doi.is_oapdf()):
                    oapdflink = "http://oapdf.sourceforge.net/cgi-bin/doipage.cgi?doi=" + doi

                newpdfs = []
                for pdf in pdfs:
                    pdfpath = pdf.replace("internal-pdf://",
                                          epath + os.sep + "PDF" + os.sep)
                    relpath = pdf.replace("internal-pdf://", "")
                    # should never happen
                    if (relpath == doi.quote() + ".pdf"):
                        newpdfs.append(pdf)
                        continue
                    if (doi):
                        if (os.path.exists(pdfpath)):
                            try:
                                os.renames(
                                    pdfpath, epath + os.sep + "PDF" + os.sep +
                                    doi.quote() + ".pdf")
                                newpdfs.append("internal-pdf://" +
                                               doi.quote() + ".pdf")
                            except:
                                print "Can't rename:", pdf, 'to', doi.quote(
                                ) + ".pdf"
                                newpdfs.append(pdf)
                                continue
                        else:
                            print "Maybe error for the record", doi, "with pdf path:", pdf, '; Try finding..',
                            pdfdir = os.path.split(pdfpath)[0]
                            if (os.path.exists(pdfdir)):
                                fs = glob.glob(pdfdir + os.sep + '*.pdf')
                                if (len(fs) == 1):
                                    try:
                                        os.renames(
                                            fs[0], epath + os.sep + "PDF" +
                                            os.sep + doi.quote() + ".pdf")
                                        newpdfs.append("internal-pdf://" +
                                                       doi.quote() + ".pdf")
                                        print "Find", fs[0], 'and rename!'
                                    except:
                                        print "Can't rename:", fs[
                                            0], 'to', doi.quote() + ".pdf"
                                        newpdfs.append(pdf)
                                        continue
                                else:
                                    print "Can't find.."
                                    newpdfs.append(pdf)
                                    continue
                            else:
                                newpdfs.append(pdf)
                                continue
                    else:
                        print "Blank doi for file:", pdf
                        newpdfs.append(pdf)
                        continue
                if (oapdflink):
                    newpdfs.append("internal-pdf://OAPDF/" + doi.quote() +
                                   ".pdf")
                self.setpdfs(i, newpdfs)
                # Set the urls
                if (oapdflink and oapdflink not in urls):
                    self.addurl(i, oapdflink, first=True)
            except Exception as e:
                print "Error at ", i, 'since: ', e
                #return 1
        if fname:
            self.write(fname)
        return 0
예제 #9
0
class EndnoteXML(object):
	def __init__(self,fname):
		if (fname):
			f=open(fname)
			self.content=re.sub(r'</?style.*?>','',f.read())
			f.close()
		else:
			self.content=""
		self.soup=BeautifulSoup(self.content,'html.parser')
		self.records=self.soup.records.contents
		self.length=len(self.records)
		
		for i in range(self.length):
			self.checktag(i,'titles')
			self.checktag(i,'authors')
			self.checktag(i,'urls')
			if (self.records[i].find('related-urls') is None):
				self.addtag(i,'related-urls','',parent='urls')
			if (self.records[i].find('pdf-urls') is None):
				self.addtag(i,'pdf-urls','',parent='urls')			
			self.checktag(i,'dates')
			self.setdoi(i,self.getdoi(i))

	#def __repr__(self):
	#	return self.soup.encode()

	def __str__(self):
		return self.soup.encode()

	def reset(self,fname):
		self.__init__(fname)

	def read(self,fname):
		self.__init__(fname)

	def reads(self,s):
		self.content=s
		self.soup=BeautifulSoup(self.content,'html.parser')
		self.records=self.soup.records.contents
		self.length=len(self.records)
		for i in range(self.length):
			self.checktag(i,'titles')
			self.checktag(i,'authors')
			self.checktag(i,'urls')
			if (self.records[i].find('related-urls') is None):
				self.addtag(i,'related-urls','',parent='urls')
			if (self.records[i].find('pdf-urls') is None):
				self.addtag(i,'pdf-urls','',parent='urls')
			self.checktag(i,'dates')
			self.setdoi(i,self.getdoi(i))

	def writes(self,encoding='utf-8'):
		return self.soup.encode(encoding=encoding)

	def write(self,fname,encoding='utf-8'):
		f=open(fname,'w')
		f.write(self.writes(encoding=encoding))
		f.close()

	def getrecord(self,num):
		if (num>=self.length):
			return None
		return self.records[num]

	def checktag(self,num,tag):
		if self.records[num].find(tag) is None:
			self.addtag(num,tag,value='')

	def addtag(self,num,tag,value=None,parent=None):
		'''value can be string, tag'''
		a=self.soup.new_tag(tag)
		if value: a.string=value
		if parent:
			self.records[num].find(parent).append(a)
		else:
			self.records[num].append(a)

	def gettag(self,num,tag,parent=None,obj=False):
		if parent:
			if self.records[num].find(parent):
				if self.records[num].find(parent).find(tag):
					if (obj):
						return self.records[num].find(parent).find(tag)
					else:
						return self.records[num].find(parent).find(tag).string
				else:
					return ''
			else:
				return ''
		else:
			if self.records[num].find(tag):
				if (obj):
					return self.records[num].find(tag)
				else:
					return self.records[num].find(tag).string
			else:
				return ''

	def settag(self,num,tag,value,parent=None):
		if parent:
			if self.records[num].find(parent):
				if self.records[num].find(parent).find(tag):
					self.records[num].find(parent).find(tag).string=value
				else:
					self.addtag(num,tag,parent=parent,value=value)
			else:
				a=self.soup.new_tag(tag)
				a.string=value
				self.addtag(num,parent,parent=None,value=a)
		else:
			if self.records[num].find(tag):
				self.records[num].find(tag).string=value
			else:
				self.addtag(num,tag,parent=None,value=value)	

	def getpath(self):
		db=self.soup.findChild("database")
		if (db):
			return os.path.splitext(db['path'])[0]+'.Data'
		else:
			return ""

	def getdoi(self,num):
		doistr=self.gettag(num,"electronic-resource-num")
		if (doistr):
			doiindex=doistr.find('10.')
		else:
			doiindex=-1
		if (doiindex >=0):
			return doistr[doiindex:].lower().strip()
		else:
			return ""

	def setdoi(self,num,value):
		self.settag(num,"electronic-resource-num",value)

	def gettitle(self,num):
		return self.gettag(num,"title")

	def settitle(self,num,value):
		self.settag(num,"title",value)

	def getjournalfull(self,num):
		return self.gettag(num,'secondary-title')

	def getyear(self,num):
		return self.gettag(num,'year','dates')

	def setyear(self,num,value):
		self.settag(num,'year',value,'dates')

	def getvolume(self,num):
		return self.gettag(num,'volume')

	def setvolume(self,num,value):
		self.settag(num,'volume',value)

	def getissue(self,num):
		return self.gettag(num,'number')

	def setissue(self,num,value):
		self.settag(num,'number',value)

	def getpages(self,num):
		return self.gettag(num,'pages')

	def setpages(self,num,value):
		self.settag(num,'pages',value)

	def getnotes(self,num):
		return self.gettag(num,'notes')

	def setnotes(self,num,value):
		self.settag(num,'notes',value)

	def geturl(self,num):
		urls=self.gettag(num,'related-urls',obj=True)
		if (urls):
			return [ i.string for i in urls.find_all('url') ]
		else:
			return []

	def seturl(self,num,value):
		'''Note that it will clean all the url!'''
		if (self.soup.find('related-urls') is not None):
			urls=self.gettag(num,'related-urls',obj=True)
			if (urls):
				urls.clear()
		else:
			self.addtag(num,'related-urls',parent='urls')
		self.addtag(num,'url',value,'related-urls')

	def addurl(self,num,value,first=False):
		urls=self.gettag(num,'related-urls',obj=True)
		a=self.soup.new_tag('url')
		a.string=value
		if (urls):
			if (not first):
				urls.append(a)
			else:
				urls.insert(0,a)
		else:
			self.settag(num,'related-urls',a,'urls')

	def getpdf(self,num):
		urls=self.gettag(num,'pdf-urls',obj=True)
		if (urls):
			return [ i.string for i in urls.find_all('url') ]
		else:
			return []

	def setpdf(self,num,value):
		'''Note that it will clean all the url!'''
		if (self.soup.find('pdf-urls') is not None):
			urls=self.gettag(num,'pdf-urls',obj=True)
			if (urls):
				urls.clear()
		else:
			self.addtag(num,'pdf-urls',parent='urls')
		self.addtag(num,'url',value,'pdf-urls')

	def setpdfs(self,num,value):
		'''Note that it will clean all the url!'''
		if (self.soup.find('pdf-urls') is not None):
			urls=self.gettag(num,'pdf-urls',obj=True)
			if (urls):
				urls.clear()
		else:
			self.addtag(num,'pdf-urls',parent='urls')
		for url in value:
			self.addtag(num,'url',url,'pdf-urls')

	def addpdf(self,num,value,first=False):
		urls=self.gettag(num,'pdf-urls',obj=True)
		a=self.soup.new_tag('url')
		a.string=value
		if (urls):
			if (not first):
				urls.append(a)
			else:
				urls.insert(0,a)
		else:
			self.addtag(num,'pdf-urls',a,'urls')

	def finddoi(self,num,prefix='',issn=''):
		title=self.gettitle(num)
		doi=DOI(self.getdoi(num))
		if (not prefix):
			prefix = doi.split('/',1)[0] if doi else ""
		volume= self.getvolume(num)
		journal=self.getjournalfull(num)
		year=self.getyear(num) 
		pages=self.getpages(num)
		self.cr=CRrecord()
		try:
			# The origin doi maybe true. Find in crossref
			if ( doi and self.cr.getfromdoi(doi,fullparse=False) and self.cr.doi):
				# Further check title
				if (strdiff(doi,self.cr.doi)>=0.85 and \
				strsimilarity(normalizeString(title),normalizeString(self.cr.title))>0.75):
					return doi
				if( volume and pages ):
					ops=pages.split('-')
					crps=self.cr.pages.split('-')
					if (len(ops)>0 and len(crps)>0 and ops[0]==crps[0] and volume==self.cr.volume):
						return doi
				if( year and pages ):
					ops=pages.split('-')
					crps=self.cr.pages.split('-')
					if (len(ops)>0 and len(crps)>0 and ops[0]==crps[0] and year==self.cr.year):
						return doi
				print "Origin DOI:",doi,"may be true but record strange..Try title"

			keyword=title+" "+journal+" "+year+" "+pages+" "+volume
			if (self.cr.getfromtitledoi(keyword,doi,year=year,limit=10,fullparse=False,prefix=prefix)):
				if (doi):
					if( prefix == self.cr.doi.split('/')[0] and strdiff(doi,self.cr.doi)>=0.85):
						return self.cr.doi
					else:
						print "Error for origin doi: "+doi+"; found: "+self.cr.doi
						return ""
				return self.cr.doi
			if (doi):
				if( strdiff(doi,self.cr.doi)>=0.85):
					return self.cr.doi
				else:
					print "Error2 for origin doi: "+doi+"; found: "+self.cr.doi
					return ""
			else:
				return ""
		except Exception as e:
			print "Error when find doi..",e,"\nRetry..."
			return self.finddoi(num,prefix=prefix,issn=issn)

	def preprocess(self):
		pass

	def cleannote(self,num):
		note=self.getnotes(num)
		notel=note.lower()
		if ("time" in notel):
			self.setnotes(num,notel[notel.find('time'):])

	def cleanallpdf(self,exceptOAPDF=True):
		'''Clean PDF record or except OAPDF record'''
		for i in range(self.length):
			if (not exceptOAPDF):
				self.setpdf(i,'')
			else:
				for pdf in self.getpdf(i):
					if "internal-pdf://OAPDF/" in pdf:
						self.setpdf(i,pdf)
						break

	def process(self,fname="",cleannote=False,prefix='',issn='',start=0):
		epath=self.getpath()
		print "Output",self.length,"to",epath+os.sep+fname
		for i in range(start,self.length):
			try:
				#if (i%100 is 0):
				#	print
				#	print "Doing:",i+1,
				#else:
				#	print i+1,

				pdfs=self.getpdf(i)
				urls=self.geturl(i)
				# Fast consider as record process before
				hasfound=False
				for pdf in pdfs:
					if "internal-pdf://OAPDF/" in pdf:
						hasfound=True
						doistr=self.gettag(i,"electronic-resource-num")
						if (doistr and len(doistr)>4 and doistr[:4]=='chk:'):
							doi=DOI(self.getdoi(i))
							if doi:
								self.setdoi(i,"chk: "+doi)
						break
						
				if not hasfound:
					for url in urls:
						if "http://oapdf.sourceforge.net/cgi-bin/" in url:
							hasfound=True
							doistr=self.gettag(i,"electronic-resource-num")
							if (doistr and len(doistr)>4 and doistr[:4]=='chk:'):
								doi=DOI(self.getdoi(i))
								if doi:
									self.setdoi(i,"chk: "+doi)
							break
				if hasfound:
					continue

				if (cleannote):
					self.cleannote(i)

				doistr=self.gettag(i,"electronic-resource-num")
				if (doistr and len(doistr)>4 and doistr[:4]=='chk:'):
					doi=DOI(self.getdoi(i))
				else:
					doi=DOI(self.finddoi(i,prefix=prefix,issn=issn))
					if doi:
						self.setdoi(i,"chk: "+doi)
				oapdflink=""
				if (doi and doi.is_oapdf()):
					oapdflink="http://oapdf.sourceforge.net/cgi-bin/doipage.cgi?doi="+doi

				newpdfs=[]
				for pdf in pdfs:
					pdfpath=pdf.replace("internal-pdf://",epath+os.sep+"PDF"+os.sep)
					relpath=pdf.replace("internal-pdf://","")
					# should never happen
					if (relpath == doi.quote()+".pdf"):
						newpdfs.append(pdf)
						continue
					if (doi):
						if (os.path.exists(pdfpath)):
							try:
								os.renames(pdfpath,epath+os.sep+"PDF"+os.sep+doi.quote()+".pdf")
								newpdfs.append("internal-pdf://"+doi.quote()+".pdf")
							except:
								print "Can't rename:",pdf,'to',doi.quote()+".pdf"
								newpdfs.append(pdf)
								continue
						else:
							print "Maybe error for the record",doi,"with pdf path:",pdf,'; Try finding..',
							pdfdir=os.path.split(pdfpath)[0]
							if (os.path.exists(pdfdir)):
								fs=glob.glob(pdfdir+os.sep+'*.pdf')
								if (len(fs)==1):
									try:
										os.renames(fs[0],epath+os.sep+"PDF"+os.sep+doi.quote()+".pdf")
										newpdfs.append("internal-pdf://"+doi.quote()+".pdf")
										print "Find",fs[0],'and rename!'
									except:
										print "Can't rename:",fs[0],'to',doi.quote()+".pdf"
										newpdfs.append(pdf)
										continue
								else:
									print "Can't find.."
									newpdfs.append(pdf)
									continue
							else:
								newpdfs.append(pdf)
								continue
					else:
						print "Blank doi for file:",pdf
						newpdfs.append(pdf)
						continue
				if (oapdflink):
					newpdfs.append("internal-pdf://OAPDF/"+doi.quote()+".pdf")
				self.setpdfs(i,newpdfs)
				# Set the urls
				if (oapdflink and oapdflink not in urls):
					self.addurl(i,oapdflink,first=True)
			except Exception as e:
				print "Error at ", i, 'since: ',e
				#return 1
		if fname:
			self.write(fname)
		return 0
예제 #10
0
	def findPDFbyISSN(self,issn,maxresult=None, step=100, offset=0, 
		usedoi=True,doifilter=None,onlinecheck=True,savestate=None,proxy=None,usebdcheck=True):
		'''Find PDF by ISSN based on search result from crossref'''
		# may be improve to not only issn..
		if (not issn):return
		if (len(issn)==9 and issn[4]=='-'):
			needurl="http://api.crossref.org/journals/"+issn+"/works"
		elif('10.' in issn):
			needurl="http://api.crossref.org/prefixes/"+issn+"/works"
		else:
			print "Error ISSN/prefix"
			sys.exit(1)
		cr=CRrecord()
		total=cr.gettotalresultfromlink(needurl)
		if (not maxresult or maxresult <=0 or maxresult>total): 
			maxresult=total
		params={"rows":str(step)}
		maxround=(maxresult-offset)/step+1
		offsetcount=offset
		bdcheck=BDCheck()

		for i in range(maxround):
			params["offset"]=str(step*i+offset)
			r=requests.get(needurl,params,timeout=timeout_setting_download)
			if (r.status_code is 200):
				# Get all check/in oapdf 
				if usebdcheck: 
					bdcheckall=bdcheck.filterdois(r.json(),oapdf=1,crjson=True)

				for j in r.json().get('message',{}).get('items',[]):
					keyword=j.get('title',[''])
					doi=DOI(j.get("DOI",""))
					if not doi:
						offsetcount+=1
						time.sleep(2)
						continue

					# Check whether in bdcheck
					if (usebdcheck and doi in bdcheckall):
						print doi, 'has search/oapdf/free by bdcheck'
						offsetcount+=1
						time.sleep(1)
						continue
						
					# If not in bdcheck, check oapdf/free and set it
					# TODO: remove it after combine oapdf information to library
					oapdffree=bdcheck.setbycheck(doi)
					if (oapdffree[0] or oapdffree[1]):
						print doi,'exist in oapdf/free library..'
						offsetcount+=1
						time.sleep(1)
						continue						

					if (keyword): 
						keyword=keyword[0]
					else:
						time.sleep(2)
						offsetcount+=1
						continue
					if usedoi:keyword+=" "+doi
					print "#####################################",offsetcount,"####################################"
					print "## Now finding for doi with title:"+doi+" "+ keyword.encode('utf-8')+"............"
					sys.stdout.flush()
					self.search(keyword.encode('utf-8'),proxy=proxy)
					bdresult=self.getallpdf(doifilter,onlinecheck=onlinecheck,savestate=savestate,usebdcheck=usebdcheck)
					bdcheck.set(doi)
					offsetcount+=1
			gc.collect()
		print "End of process for",issn
예제 #11
0
	def renamecheck(self,fname,wtitle=0.65,cutoff=0.85,justcheck=False,resetfile=True,fdoi=None,excludedoi=None, fobj=None):
		'''A complex function to get doi from file name, 
		check in crossref, check in pdf file, rename it!
		just check can cancel move file'''
		### Result back:
		# 0: Done 
		# 1: High
		# 2: Unsure
		# 3: Untitle
		# 4: Fail
		# 5: Page0
		# 6: ErrorDOI
		# 10: Unknow


		if (resetfile and isinstance(fobj,(file,StringIO))):
			self.reset(fname="",fobj=fobj)
			fname="None"

		# len(self.doi) is 1 and len(self.doi - excludedoi) is 1 : 
		# :: First Run and perform check
		# len(self.doi) is 1 or len(self.doi - excludedoi) is 1 :
		if (not fname and not fdoi):
			print "No given file name or doi! (Return 6)"
			return 6

		if (fname and not fdoi and excludedoi):
			print "What do you want?! No excludedoi set by user! (Return 9)"
			return 9

		
		if (resetfile and fname !="None"): 
			self.reset(fname)
		elif(resetfile and not isinstance(fobj,(file,StringIO))):
			print "Use reset file but no file name/object is given!"
			return 9

		if (self.maxpage == 0):
			if not justcheck: 
				self.moveresult(5, printstr="Error Page 0 (Page0, R5): "+self._fname)
			return 5

		if (not excludedoi):
			excludedoi=set()

		if (not fdoi):
			#File obj is ""
			fdoi=DOI(os.path.splitext(os.path.basename(self._fname))[0])
		else:
			fdoi=DOI(fdoi)

		recursive= (len(excludedoi) > 0)
		# If in recursive, don't move file!
		if recursive: justcheck=True

		if resetfile and not recursive:
			self.realdoi=fdoi

		# Only find DOI in first time!
		if (not recursive and fdoi): 
			self.finddoi(1)
		elif (not recursive and not fdoi):
			self.finddoi(set([1,2,self.maxpage]))

		# file doi is shit..Recursively use doi in file or fail
		if (not fdoi and not recursive):
			if (len(self.doi) is 1 or len(self.doi) is 2):
				print "Origin fdoi wrong but has 1~2 dois in file:",self._fname,
				return self.recursivedoicheck(excludedoi,olddoi=fdoi,wtitle=wtitle,cutoff=cutoff,justcheck=justcheck)
			# No doi or >2 dois in file
			else:
				if not justcheck: 
					self.moveresult(4,printstr="Error fdoi and 0/too much doi. (Fail): "+self._fname)
				return 4
		elif (not fdoi and recursive):
			print "doi (in recursion) may wrong with error doi. Should never happen.."
			return 4 # Fail

		# fdoi is ok
		cr=CRrecord()
		try:
			cr=cr.valid_doi(fdoi,fullparse=True)
		except requests.exceptions.RequestException as e:
			print e
			cr=None
		except Exception as e:
			print e
			cr=None		

		# Error when year=None, improve in crrecord.
		#if (cr and not cr.year):
		#	cr.year='8888'	

		#crossref is ok
		if (fdoi and cr):
			totalpagenumber=1
			try:
				totalpagenumber=self.totalpages(cr.pages)
			except ValueError as e:
				# should never happen now
				print e, cr.pages

			totalpagewrong=False
			#print "pages:",self.maxpage,' in crossref:',cr.pages,totalpagenumber
			if totalpagenumber>0 and not (self.maxpage >= totalpagenumber and self.maxpage <= totalpagenumber+2):
				totalpagewrong=True
				# When paper with supporting information
				if (self.maxpage > totalpagenumber+2):
					self.finddoi(page=2)
					if (self.withSI or (self.findtext('Supporting Information', page=[totalpagenumber+1,totalpagenumber+2])
						and self.findtext(cr.title, similarity=0.75, page=[totalpagenumber+1,totalpagenumber+2]))):
						if not recursive : self.finddoi(totalpagenumber);
						self.withSI=True
						totalpagewrong=False
					# For NIH Public Access
					elif (self.hascontent("NIH Public Access")[0]):
						totalpagewrong=False
					#Such as some Nature with SI in paper without notify.
					elif (self.withSI or (totalpagenumber>1 and self.findtext("acknowledgment", page=[totalpagenumber-1, totalpagenumber]) 
						and self.findtext("reference", page=[totalpagenumber-1, totalpagenumber]))):
						self.withSI=True
						totalpagewrong=False

			# Recursive but total page wrong. Fast end recursivedoicheck
			if (totalpagewrong and recursive):	
				return 4

			# Just check first page, not find(find before..), faster:
			doivalid=self.checkdoi(fdoi,page=1,iterfind=False,justcheck=True)
			titleeval=self.checktitle(cr.title)
			if (totalpagenumber > 0 and not totalpagewrong):
				if (doivalid and titleeval[0] and len(self.doi) is 1):
					# Yes! Very Good PDF!
					self.realdoi=fdoi
					if not justcheck: 
						if (self.maxpage>=2 and self.maxpage == totalpagenumber and
							not self.findtext('Supporting Information', page=[1])):
							self.moveresult(0,good=True)
						else:
							self.moveresult(0)
					return 0

			# Further check doi in page2/last, Finally, will check 1,2 and last pages.
			if (recursive):
				doivalid= ( self.checkdoi(fdoi,page=2,iterfind=True,justcheck=True) or doivalid )
			else:
				doivalid= ( self.checkdoi(fdoi,page=2,iterfind=True) or doivalid )

			if len(self.doi)>3:
				# Too much doi may be some abstract
				self.moveresult(2,printstr='Has more than 3 dois! (Unsure):'+self._fname)
				return 2

			# Page wrong and try recursive use doi
			if (totalpagewrong):
				if (len(self.doi) is 1 or len(self.doi) is 2):
					doi=DOI(list(self.doi)[0])
					# DOI in file is same so error. Don't need recursive
					if (len(self.doi) is 1 and doi == fdoi):
						if not justcheck: 
							self.moveresult(4,printstr="PDF Page "+str(self.maxpage)+"!="+str(totalpagenumber)+"(Fail): "+self._fname)
						return 4

					print 'Wrong total page with dois in file,',self._fname,fdoi,',try recursive'
					return self.recursivedoicheck(excludedoi,olddoi=fdoi,wtitle=wtitle,cutoff=cutoff,justcheck=justcheck)
				else:
					if not justcheck: 
						self.moveresult(4,printstr="PDF Page "+str(self.maxpage)+"!="+str(totalpagenumber)+"(Fail): "+self._fname)
					return 4

			if (not totalpagewrong):
				crscore=self.scorefitting(cr)
				if (self.maxpage <= totalpagenumber+2): 
					# Maybe check when maxpage >total+2
					titleeval=self.checktitle(cr.title)
				if cr.title.strip()=="":
					titleeval=(False,0.9)
				titlevalid=titleeval[0]
				try:
					paperyear=int(cr.year)
				except:
					paperyear=9999
				try:
					# Too old maybe lost information
					if (paperyear>1990):
						titlevalid=titlevalid or (titleeval[1]*wtitle+crscore['total'])>=cutoff
					else:
						titlevalid=titlevalid or (titleeval[1]*wtitle+crscore['total'])>=cutoff-0.1
				#(self.checktitle(cr.title,similarity=0.85) and self.checkcrossref(cr))
				except Exception as e:
					print e

				if (doivalid):
					if (titlevalid):					
						# Yes! Good PDF!
						self.realdoi=fdoi
						if not justcheck: 
							if (self.maxpage>=2 and self.maxpage == totalpagenumber 
								and len(self.doi) is 1 and
								not self.findtext('Supporting Information', page=[1])):
								self.moveresult(0,good=True)
							else:
								self.moveresult(0)
						return 0

					print "Title/Paper score:",titleeval[1],crscore,self._fname
					if (len(self.doi - set([fdoi])) == 1 and not recursive):
						
						# Try one more
						newresult = self.recursivedoicheck(excludedoi,olddoi=fdoi,wtitle=wtitle,cutoff=cutoff,justcheck=True)
						if (newresult is 0):
							newdoi=DOI(list(self.doi - set([fdoi]))[0])
							self.realdoi=newdoi
							print 
							if not justcheck: self.moveresult(0, 
								printstr="(Rename)fdoi ok, but not title. In file doi "+newdoi+" is better for "+self._fname,
								newfname=newdoi.quote()+".pdf")
							return 0

					# Else DOI ok but not title
					if not justcheck: 
						self.moveresult(3,printstr="OK fdoi but not title(Untitle): "+self._fname)
					return 3
				
				# Indeed, doi maybe in pdf, but strange format..
				if (self.checkdoinormaltxt(fdoi)):
					if (titlevalid):
						# Further check only when title OK
						if (self.checkdoifurther(fdoi)):
							# Fine! move to Done dir
							if not justcheck: 
								if (self.maxpage>=2 and self.maxpage == totalpagenumber 
								and len(self.doi) is 1 and
								not self.findtext('Supporting Information', page=[1])):
									self.moveresult(0,good=True)
								else:
									self.moveresult(0)
							return 0
						else:
							# Can't find, but high similar! move to High dir
							if not justcheck: 
								self.moveresult(1,printstr="OK title and nospacebreak doi,but not pass(High): "+self._fname)
							return 1
					else:
						# DOI ok but not title
						print "Title/Paper score:",titleeval[1],crscore,self._fname
						if not justcheck: 
							self.moveresult(3,printstr="Maybe OK fdoi but not title(Untitle): "+self._fname)
						return 3						

				# DOI maybe not exist ....
				if (titlevalid):
					tmpdois=set(self.doi)
					for d in tmpdois:
						dd=DOI(d)
						if ( not dd.valid_doiorg(geturl=False) ):
							self.doi.remove(d)
							
					# Old paper don't have doi...
					if len(self.doi) is 0 and totalpagenumber>0:
						if (crscore['total'] >= 0.4):
							if not justcheck: 
								if (self.maxpage>=2 and self.maxpage == totalpagenumber and
								not self.findtext('Supporting Information', page=[1])):
									self.moveresult(0,good=True)
								else:
									self.moveresult(0)
							return 0
						elif (titleeval[1]>=0.85 and crscore['total'] >= 0.35):
							if not justcheck: 
								if (self.maxpage>=2 and self.maxpage == totalpagenumber and
								not self.findtext('Supporting Information', page=[1])):
									self.moveresult(0,good=True)
								else:
									self.moveresult(0)								
							return 0
						elif (titleeval[1]>=0.95 and crscore['total'] >=0.3):
							if not justcheck: 
								if (self.maxpage>=2 and self.maxpage == totalpagenumber and
								not self.findtext('Supporting Information', page=[1])):
									self.moveresult(0,good=True)
								else:
									self.moveresult(0)
							return 0
						elif (titleeval[1]>=0.90 and crscore['pages']>=0.9 and crscore['year']>=0.9 and (crscore['journal']>=0.9 or crscore['issn']>=0.9)):
							if not justcheck: self.moveresult(0)
							return 0
						elif (titleeval[1]>=0.90 and crscore['pages']>=0.5 and crscore['year']>=0.9 and (crscore['journal']>=0.9 or crscore['issn']>=0.9) and crscore['authors']>=0.7):
							if not justcheck: self.moveresult(0)
							return 0
						elif (titleeval[1]>=0.75 or crscore['total'] >=0.25):
							print "Title/Paper score:",titleeval[1],crscore,self._fname
							if not justcheck: 
								self.moveresult(1,printstr="OK title and high info fit. But no doi(Highly): "+self._fname)
								return 1
						else:
							print "Title/Paper score:",titleeval[1],crscore,self._fname
							if not justcheck: 
								self.moveresult(2,printstr="OK title and ok info fit. But no doi(Unsure): "+self._fname)
							return 2
					elif len(self.doi) is 0 and totalpagenumber== -1:
						if (titleeval[1]>=0.90 and crscore['pages']>=0.5 and crscore['year']>=0.9 and (crscore['journal']>=0.9 or crscore['issn']>=0.9) and crscore['authors']>=0.7):
							if not justcheck: self.moveresult(0)
							return 0
						else:
							print "Title/Paper score:",titleeval[1],crscore,self._fname
							if not justcheck: 
								self.moveresult(2,printstr="OK title and high info fit. But no doi and no total pages(Unsure): "+self._fname)
							return 2												
					elif len(self.doi) is 0 and totalpagenumber<=0:
						print "Title/Paper score:",titleeval[1],crscore,self._fname
						if not justcheck: 
							self.moveresult(2,printstr="OK title and high info fit. But no doi and no total pages(Unsure): "+self._fname)
						return 2
					elif ( len(self.doi) > 0 and not recursive):
						print "Good title but file doesn't contain fdoi, however it has >0 doi in file. "
						outnow=self.recursivedoicheck(excludedoi,olddoi=fdoi,wtitle=wtitle,cutoff=cutoff,justcheck=True)
						if outnow > 0:
							if not justcheck:
								self.moveresult(2,printstr="OK title but not fdoi. In file doi is not good(Unsure): "+self._fname)
							return 2
						elif(outnow==0):
							print 'Good Title but Fail fdoi. Paper has good in file doi,',self._fname,',try recursive'
							return self.recursivedoicheck(excludedoi,olddoi=fdoi,wtitle=wtitle,cutoff=cutoff,justcheck=justcheck)

					### Old method check old items:
					#if (self.checkcrossref(cr)):
					#	if (int(cr.year)<=1999 and len(self.doi) is 0):
					#		# Highly possible right
					#		if not justcheck: self.movetodir("High")
					#		return True
					#	  Bentham, often blank doi
					#	elif (fdoi[:8] == '10.2174/' and len(self.doi) is 0):
					#		if not justcheck: self.movetodir("Done")
					#		return True
					#	elif (len(self.doi) is 0):
					#		print "Title/Paper score:",titleeval[1],crscore,self._fname
					#		if not justcheck: 
					#			self.moveresult(1,printstr="OK title and high info fit. But no doi(Highly): "+self._fname)
					#		return 1							
					#	else:
					#		if not justcheck: 
					#			self.moveresult(2,printstr="OK title and high info fit. But doi exist not fit(Unsure): "+self._fname)
					#		return 2								
					#elif(len(self.doi) is 0):
					#	# Maybe wrong file and no doi
					#	if not justcheck: 
					#		self.moveresult(2,printstr="Not found doi in file but ok title (Unsure): "+self._fname)
					#	return 2

			#fdoi,title wrong, no doi in file
			# Or in recursive mode
			if (len(self.doi) is 0 or recursive):
				if not justcheck: 
					self.moveresult(4,printstr="Both fdoi and title wrong, no doi in file(Fail): "+self._fname)
				return 4

			# Indeed, file has only one more doi, not the same to fname
			if (len(self.doi - set([fdoi])) is 1 ):
				print 'Fail fdoi/title. Paper with one more doi in file,',self._fname,',try recursive'
				return self.recursivedoicheck(excludedoi,olddoi=fdoi,wtitle=wtitle,cutoff=cutoff,justcheck=justcheck)
			elif(len(self.doi) > 1):
				if not justcheck: 
					self.moveresult(4,printstr="fdoi/title fail. Too much infile doi(Fail): "+self._fname)
				return 4
			else:
				if not justcheck: 
					self.moveresult(4,printstr="What????? What?????(Fail):"+self._fname)
				return 4
		# not cr
		else:
			if (not recursive):
				self.finddoi(set([1,2,self.maxpage]))
				if (len(self.doi) is 1 or len(self.doi) is 2):
					print 'Error DOI filename,',self._fname,',try recursive'
					return self.recursivedoicheck(excludedoi,olddoi=fdoi,wtitle=wtitle,cutoff=cutoff,justcheck=justcheck)				
			if not justcheck: 
				self.moveresult(6,"Error DOI fname(Fail):"+self._fname)
			return 6