def dlPaper(self):
    """Download the PDF for each queued message and record the outcome.

    Iterates over ``self.msg``; item 1 of each entry is used as the article
    URL. For entries whose ``self.urlFrom(url)`` code is 1 or 2, the page is
    fetched through ``self.openner`` and the first ``.pdf`` link found in an
    element whose ``class`` matches ``download`` is saved to the current
    working directory. Two items are then appended to the entry: the local
    file name and a flag (``1`` when a PDF was downloaded, otherwise ``''``
    and ``0``). Entries with any other source code are left untouched.

    Errors raised while fetching or downloading propagate to the caller, in
    which case nothing is appended to the entry being processed.
    """
    base_url = 'http://www.nature.com'
    # ``[^"]*`` keeps the capture inside a single href value (a greedy ``.*``
    # can run across several attributes or links), and the escaped dot
    # requires a real ".pdf" suffix instead of any character before "pdf".
    href_pdf = re.compile(r'href="([^"]*\.pdf)"')
    download_class = re.compile(r'download')

    for msg in self.msg:
        url = msg[1]
        if self.urlFrom(url) not in (1, 2):
            continue

        page = self.openner.open(urllib2.Request(url)).read()
        candidates = BeautifulSoup(page).findAll(attrs={'class': download_class})

        filename = ''
        flag = 0
        for item in candidates:
            found = href_pdf.search(str(item))
            if not found:
                continue
            pdf_url = base_url + found.group(1)
            # Drop only the trailing "pdf" and keep the dot so the name reads
            # "<stem>.<timestamp>.pdf". str.rstrip('pdf') would strip any run
            # of the characters p/d/f, not the suffix.
            stem = pdf_url.split('/')[-1][:-len('pdf')]
            stamp = time.strftime("%Y.%m.%d.%H.%M.%S", time.localtime())
            filename = stem + stamp + '.pdf'
            urllib.urlretrieve(pdf_url, filename)
            flag = 1
            break  # only the first PDF link is downloaded

        msg.append(filename)
        msg.append(flag)
def FromCell(self, msg):
    """Print the PDF links discovered for the article referenced by *msg*.

    ``msg['Subject']`` is used as the article page URL. Every element whose
    ``name`` matches ``citation_pdf_url`` is scanned for a ``content="...pdf"``
    value; each such value is printed and then fetched through
    ``self.openner``. In the fetched page, the first ``href`` of every ``<p>``
    element is printed with each ``amp;`` occurrence removed.

    Nothing is returned and *msg* is not modified.
    """
    landing_html = self.openner.open(urllib2.Request(msg['Subject'])).read()
    meta_tags = BeautifulSoup(landing_html).findAll(
        attrs={'name': re.compile(r'citation_pdf_url')})

    for tag in meta_tags:
        content = re.search(r'content="(.*?.pdf)"', str(tag))
        if not content:
            continue

        pdf = content.group(1)
        print(pdf)

        # The advertised URL is fetched as an HTML page and its paragraphs
        # are searched for further links.
        linked_html = self.openner.open(urllib2.Request(pdf)).read()
        for paragraph in BeautifulSoup(linked_html).findAll('p'):
            link = re.search(r'href="(.*?)"', str(paragraph))
            if link:
                # Removing "amp;" turns an HTML-escaped "&amp;" back into "&".
                pdf = link.group(1).replace('amp;', '')
                print(pdf)
def FromScience(self, msg):
    """Download the PDF(s) linked from the article page referenced by *msg*.

    ``msg['Subject']`` is used as the article page URL. The page is fetched
    through ``self.openner`` and every ``<a>`` element whose ``href`` ends in
    ``.pdf`` is downloaded to the current working directory, with the href
    appended to the ``http://www.sciencemag.org`` base URL. After each
    download ``msg['FileName']`` is set to the local file name and
    ``msg['Flag']`` to ``1``.

    The loop does not stop at the first match, so when several links match,
    *msg* ends up describing the last one downloaded. When no link matches,
    *msg* is left unchanged. Errors raised while fetching or downloading
    propagate to the caller.
    """
    base_url = 'http://www.sciencemag.org'
    # ``[^"]*`` keeps the capture inside a single href value, and the escaped
    # dot requires a real ".pdf" suffix instead of any character before "pdf".
    href_pdf = re.compile(r'href="([^"]*\.pdf)"')

    page = self.openner.open(urllib2.Request(msg['Subject'])).read()
    for anchor in BeautifulSoup(page).findAll('a'):
        found = href_pdf.search(str(anchor))
        if not found:
            continue
        pdf_url = base_url + found.group(1)
        # Drop only the trailing "pdf" and keep the dot so the name reads
        # "<stem>.<timestamp>.pdf". str.rstrip('pdf') would strip any run of
        # the characters p/d/f, not the suffix.
        stem = pdf_url.split('/')[-1][:-len('pdf')]
        stamp = time.strftime("%Y.%m.%d.%H.%M.%S", time.localtime())
        filename = stem + stamp + '.pdf'
        urllib.urlretrieve(pdf_url, filename)
        msg['FileName'] = filename
        msg['Flag'] = 1