Пример #1
0
    def dlPaper(self):
        """Download the PDF for every queued message and record the outcome.

        Each entry of ``self.msg`` is a list whose second item is an article
        URL. Every entry is extended in place with exactly two items: the
        saved file name and a flag, ``1`` when a PDF was downloaded and ``0``
        (with an empty file name) otherwise.

        Only URLs for which ``self.urlFrom`` returns 1 or 2 are fetched. The
        page is searched for elements whose ``class`` matches "download", and
        the first ``href`` ending in ".pdf" is prefixed with
        http://www.nature.com and saved under a timestamped file name.
        """
        nature = 'http://www.nature.com'
        # [^"]* keeps the match inside a single attribute value and \. demands
        # a literal dot, so the captured link always ends in ".pdf".
        pdf_href = re.compile(r'href="([^"]*\.pdf)"')

        for msg in self.msg:
            filename = ''
            flag = 0
            url = msg[1]

            if self.urlFrom(url) in (1, 2):
                page = self.openner.open(urllib2.Request(url)).read()
                candidates = BeautifulSoup(page).findAll(
                    attrs={'class': re.compile(r'download')})
                for item in candidates:
                    found = pdf_href.search(str(item))
                    if found:
                        pdf_url = nature + found.group(1)
                        # Cut the trailing "pdf" but keep the dot, then insert
                        # a timestamp before the extension.
                        stem = pdf_url[:-len('pdf')].split('/')[-1]
                        filename = (stem
                                    + time.strftime("%Y.%m.%d.%H.%M.%S",
                                                    time.localtime())
                                    + '.pdf')
                        urllib.urlretrieve(pdf_url, filename)
                        flag = 1
                        break

            # Append both fields on every path (including "page fetched but no
            # PDF link found") so all records keep the same shape.
            msg.append(filename)
            msg.append(flag)
Пример #2
0
 def FromCell(self, msg):
     """Print the PDF links reachable from the page named in ``msg['Subject']``.

     The page is fetched through ``self.openner`` and searched for elements
     whose ``name`` attribute matches "citation_pdf_url". For each such
     element with a ``content="...pdf"`` value, that value is printed and
     then fetched in turn; for every ``<p>`` element of the second page the
     first ``href`` value is printed with each "amp;" substring removed.

     Nothing is returned and ``msg`` is not modified.
     """
     prefix = ''
     landing_html = self.openner.open(urllib2.Request(msg['Subject'])).read()
     meta_tags = BeautifulSoup(landing_html).findAll(
         attrs={'name': re.compile(r'citation_pdf_url')})

     for tag in meta_tags:
         content = re.search(r'content="(.*?.pdf)"', str(tag))
         if not content:
             continue

         pdf_page_url = prefix + content.group(1)
         print(pdf_page_url)

         # The advertised URL is itself parsed as HTML and scanned for links.
         pdf_page_html = self.openner.open(
             urllib2.Request(pdf_page_url)).read()
         for paragraph in BeautifulSoup(pdf_page_html).findAll('p'):
             link = re.search(r'href="(.*?)"', str(paragraph))
             if link:
                 # Removing "amp;" turns an escaped "&amp;" back into "&".
                 print(link.group(1).replace('amp;', ''))
Пример #3
0
 def FromScience(self, msg):
     """Download PDF links found on the page named in ``msg['Subject']``.

     The page is fetched through ``self.openner`` and every ``<a>`` tag is
     scanned for an ``href`` value ending in "pdf". A matching link is
     prefixed with http://www.sciencemag.org, saved under a timestamped
     file name, and recorded on ``msg`` as ``msg['FileName']`` with
     ``msg['Flag']`` set to 1.

     ``msg`` is read and written by key; presumably a dict-like message
     record -- confirm against the caller.
     """
     url = msg['Subject']
     # Links on the page are site-relative, so this host is prepended to them.
     pub = 'http://www.sciencemag.org'
     req = urllib2.Request(url)
     res = self.openner.open(req).read()
     parser = BeautifulSoup(res)
     pr = parser.findAll('a')
     # NOTE(review): if the loop is not exited after the first hit, each later
     # match triggers another download and overwrites msg['FileName'] --
     # confirm this is intended.
     for item in pr:
         # NOTE(review): the "." before "pdf" is unescaped, so it matches any
         # character, not only a literal dot.
         s = re.search(r'href="(.*?.pdf)"', str(item))
         if s:
             pdf = pub + s.group(1)
             # rstrip('pdf') removes trailing 'p'/'d'/'f' characters (a
             # character set, not a suffix); for a name ending in ".pdf" that
             # leaves "name.", to which the timestamp and ".pdf" are appended.
             filename = pdf.rstrip('pdf').split('/')[-1] + str(
                 time.strftime("%Y.%m.%d.%H.%M.%S",
                               time.localtime())) + '.pdf'
             urllib.urlretrieve(pdf, filename)
             msg['FileName'] = filename
             msg['Flag'] = 1