'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0' } landingpage = requests.get(url, headers=headers) soup = BeautifulSoup(landingpage.text) pdflink = soup.find('a', {'id': 'pdfLink'}) if pdflink is not None: pdflink = pdflink['href'] else: # Try something else. pdflink = soup.find('div', {'id': 'article-download'}) if pdflink is None: print 'Could not find PDF link on ScienceDirect page.' return False pdflink = 'http:' + pdflink.a['href'] # Sciencedirect requires cookies for pdf downloads. pdf = requests.get(pdflink, cookies=landingpage.cookies, headers=headers).content if pdf[:4] != '%PDF': print 'You do not seem to have the permission to view this pdf.' return False with open(filename, 'wb') as f: f.write(pdf) return True import base base.register_module('http://www\.sciencedirect\.com/science/article/.*', { 'name': 'sciencedirect', 'download_pdf': download_pdf, })
def download_pdf(url, filename):
    """Download the PDF linked from a ScienceDirect article page.

    The landing page at ``url`` is fetched and searched for a PDF link:
    first an ``<a id="pdfLink">`` anchor, then the first anchor inside
    ``<div id="article-download">``. The link is then requested with the
    landing page's cookies and the response is written to ``filename``.

    Returns True if a PDF was written, False if no link could be found or
    the response does not start with the ``%PDF`` magic bytes.
    """
    headers = {
        "User-agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0"
    }
    landingpage = requests.get(url, headers=headers)
    soup = BeautifulSoup(landingpage.text)

    anchor = soup.find("a", {"id": "pdfLink"})
    if anchor is not None:
        pdflink = anchor["href"]
    else:
        # Try something else.
        container = soup.find("div", {"id": "article-download"})
        # The div can be present without any anchor inside it; treat that
        # the same as a missing div instead of failing on ``None["href"]``.
        if container is None or container.a is None:
            print("Could not find PDF link on ScienceDirect page.")
            return False
        pdflink = "http:" + container.a["href"]

    # Sciencedirect requires cookies for pdf downloads.
    pdf = requests.get(pdflink, cookies=landingpage.cookies, headers=headers).content
    # ``content`` is a byte string, so compare against a bytes literal; this
    # gives the same result on Python 2 and keeps the check valid on Python 3.
    if not pdf.startswith(b"%PDF"):
        print("You do not seem to have the permission to view this pdf.")
        return False

    with open(filename, "wb") as f:
        f.write(pdf)
    return True


import base

# Raw string: same pattern text, without relying on invalid "\." escapes.
base.register_module(
    r"http://www\.sciencedirect\.com/science/article/.*",
    {"name": "sciencedirect", "download_pdf": download_pdf},
)
import re

import requests

# Captures the href of the "FullText PDF" anchor on an ACM citation page.
# Raw string: same pattern text, without relying on invalid "\s" escapes.
download_pdf_regex = re.compile(
    r'\s*<a name="FullTextPDF" title="FullText PDF" href="([^"]*).*')


def download_pdf(url, filename):
    """Download the full-text PDF linked from an ACM citation page.

    The landing page at ``url`` is fetched and searched with
    ``download_pdf_regex`` for the full-text link. The link is resolved
    against ``http://dl.acm.org/``, requested with the landing page's
    cookies, and the response is written to ``filename``.

    Returns True if a PDF was written, False if the link was not found or
    the response does not start with the ``%PDF`` magic bytes.
    """
    headers = {
        'User-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0'
    }
    landingpage = requests.get(url, headers=headers)

    result = download_pdf_regex.search(landingpage.text)
    if result is None:
        return False
    fulltext_url = 'http://dl.acm.org/' + result.group(1)

    # ACM requires cookies for pdf downloads.
    pdf = requests.get(fulltext_url, cookies=landingpage.cookies,
                       headers=headers).content
    # Do not save an HTML error or login page under a PDF file name and
    # then report success.
    if not pdf.startswith(b'%PDF'):
        return False

    with open(filename, 'wb') as f:
        f.write(pdf)
    return True


import base

base.register_module(r'http://dl\.acm\.org/citation\.cfm.*', {
    'name': 'ACM',
    'download_pdf': download_pdf,
})
('ctl00$ctl14$SearchControl$BasicPublicationTextBox', ''), ('ctl00$ctl14$SearchControl$BasicVolumeTextBox', ''), ('ctl00$ctl14$SearchControl$BasicIssueTextBox', ''), ('ctl00$ctl14$SearchControl$BasicPageTextBox', ''), ('ctl00$ContentPrimary$ctl00$ctl00$Export', 'CitationOnlyRadioButton'), ('ctl00$ContentPrimary$ctl00$ctl00$CitationManagerDropDownList', 'BibTex'), ('ctl00$ContentPrimary$ctl00$ctl00$ExportCitationButton', 'Export+Citation'), ('__EVENTVALIDATION', eventvalidation) ]) return urllib.urlretrieve(url, filename, data=data) is not None def download_pdf_chapter(url, filename): return urllib.urlretrieve( url.replace('/chapter/', '/content/pdf/', 1) + '.pdf', filename) is not None import base base.register_module( 'http://www\.springerlink\.com/content/.*', { 'name': 'springerlink', 'download_pdf': download_pdf, 'download_bib': download_bib, }) base.register_module('http://link\.springer\.com/chapter/.*', { 'name': 'springerlink_chapter', 'download_pdf': download_pdf_chapter, })
import re

import requests

# Matches the "FullText PDF" anchor of an ACM citation page; group 1 is the
# (site-relative) href. Written as a raw string so "\s" is not treated as
# an invalid string escape.
download_pdf_regex = re.compile(r'\s*<a name="FullTextPDF" title="FullText PDF" href="([^"]*).*')


def download_pdf(url, filename):
    """Fetch the full-text PDF for an ACM citation page and save it.

    Args:
        url: address of the ACM citation page.
        filename: path the PDF bytes are written to.

    Returns:
        True when a PDF was saved; False when the page has no full-text
        link or the downloaded content does not begin with ``%PDF``.
    """
    headers = {
        'User-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0'
    }
    landingpage = requests.get(url, headers=headers)

    result = download_pdf_regex.search(landingpage.text)
    if result is None:
        return False
    fulltext_url = 'http://dl.acm.org/' + result.group(1)

    # ACM requires cookies for pdf downloads.
    pdf = requests.get(fulltext_url, cookies=landingpage.cookies, headers=headers).content
    # Anything that is not a PDF (e.g. an HTML error page) is rejected
    # rather than written to disk and reported as a success.
    if not pdf.startswith(b'%PDF'):
        return False

    with open(filename, 'wb') as f:
        f.write(pdf)
    return True


import base

base.register_module(r'http://dl\.acm\.org/citation\.cfm.*', {'name': 'ACM', 'download_pdf': download_pdf, })
import requests


def download_pdf(url, filename):
    """Download the PDF for an arXiv abstract page and save it.

    The PDF address is derived from ``url`` by swapping the ``/abs/`` path
    segment for ``/pdf/``; the response is written to ``filename``.

    Returns True if a PDF was written, False if the response does not
    start with the ``%PDF`` magic bytes.
    """
    headers = {
        'User-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0'
    }
    # Replace only the first '/abs/' so a later occurrence in the URL
    # is left untouched.
    pdf_url = url.replace('/abs/', '/pdf/', 1)
    pdf = requests.get(pdf_url, headers=headers).content
    # Do not save a non-PDF response (e.g. an HTML page) as a PDF and then
    # report success.
    if not pdf.startswith(b'%PDF'):
        return False

    with open(filename, 'wb') as f:
        f.write(pdf)
    return True


import base

# The dot is escaped so the pattern matches the literal host only, not any
# character in that position.
base.register_module(r'http://arxiv\.org/abs/.*', {
    'name': 'arXiv',
    'download_pdf': download_pdf,
})