def test_download_file(self):
    """Test if download_file works.

    Registers a canned GET response for a fixed URL, downloads that URL
    into a temporary file and checks that the file content equals the
    registered body. The temporary file is removed afterwards.
    """
    httpretty.register_uri(httpretty.GET,
                           "http://example.com/test.txt",
                           body="Lorem ipsum\n",
                           status=200)
    # Only the path is needed; close the descriptor returned by mkstemp
    # right away so it is not leaked.
    file_fd, file_name = tempfile.mkstemp()
    os.close(file_fd)
    try:
        download_file("http://example.com/test.txt", file_name)
        # Read through a context manager so the handle is closed
        # deterministically instead of relying on garbage collection.
        with open(file_name) as downloaded:
            content = downloaded.read()
    finally:
        # mkstemp leaves deletion to the caller.
        os.remove(file_name)
    self.assertEqual("Lorem ipsum\n", content)
def test_download_file(self):
    """Test if download_file works.

    Registers a canned GET response for a fixed URL, downloads that URL
    into a temporary file and checks that the file content equals the
    registered body. The temporary file is removed afterwards.
    """
    httpretty.register_uri(
        httpretty.GET,
        "http://example.com/test.txt",
        body="Lorem ipsum\n",
        status=200
    )
    # Only the path is needed; close the descriptor returned by mkstemp
    # right away so it is not leaked.
    file_fd, file_name = tempfile.mkstemp()
    os.close(file_fd)
    try:
        download_file("http://example.com/test.txt", file_name)
        # Read through a context manager so the handle is closed
        # deterministically instead of relying on garbage collection.
        with open(file_name) as downloaded:
            content = downloaded.read()
    finally:
        # mkstemp leaves deletion to the caller.
        os.remove(file_name)
    self.assertEqual("Lorem ipsum\n", content)
def _attach_fulltext(self, rec, doi):
    """Attach the fulltext PDFs linked from the article page to ``rec``.

    Resolves ``doi`` through dx.doi.org and parses the page reached after
    redirects. For every anchor inside the expected container ``div``
    whose ``href`` ends with ``pdf``, an ``856`` field carrying the PDF
    URL and an ``FFT`` field carrying the path returned by
    ``download_file`` are added to ``rec`` via ``record_add_field``.

    :param rec: record passed through to ``record_add_field``.
    :param doi: DOI string appended to ``http://dx.doi.org/``.
    :return: ``None``. Returns early, adding nothing, when the page has
        no body or the container ``div`` is not found.
    """
    # Local import keeps this block self-contained.
    from os.path import isdir

    response = requests.get('http://dx.doi.org/' + doi)
    # Use the URL after redirects: the hrefs on the page are prefixed
    # with the scheme and host of the site that actually served it.
    parsed_uri = urlparse(response.url)
    domain = '{uri.scheme}://{uri.netloc}'.format(uri=parsed_uri)
    soup = BeautifulSoup(response.text)
    try:
        if 'epjconf' in doi:
            div = soup.body.find('div', attrs={'id': 'header'})
        else:
            div = soup.body.find('div', attrs={
                'class': 'module_background files'
            })
        links = div.findAll('a')
    except AttributeError:
        # ``soup.body`` or the looked-up div is None.
        return

    out_folder = join(CFG_EDPSCIENCE_OUT_FOLDER, "fulltexts")
    for pdf in links:
        # Anchors without an href (e.g. named anchors) are skipped
        # instead of raising KeyError and aborting the whole record.
        href = pdf.get('href')
        if not href or not href.endswith('pdf'):
            continue
        link_to_pdf = domain + href
        record_add_field(rec,
                         '856',
                         ind1='4',
                         subfields=[('u', link_to_pdf),
                                    ('y', 'EDP Sciences server')])
        filename = join(out_folder, link_to_pdf.split('/')[-1])
        try:
            makedirs(out_folder)
        except (IOError, OSError):
            # makedirs also raises when the folder already exists; that
            # is not a failure. Only give up on the target path when the
            # folder really is not there.
            if not isdir(out_folder):
                filename = None
        filename = download_file(from_url=link_to_pdf,
                                 to_filename=filename,
                                 retry_count=5)
        record_add_field(rec,
                         'FFT',
                         subfields=[('a', filename),
                                    ('t', 'INSPIRE-PUBLIC'),
                                    ('d', 'Fulltext')])
def _attach_fulltext(self, rec, doi):
    """Attach the fulltext PDFs linked from the article page to ``rec``.

    Resolves ``doi`` through dx.doi.org and parses the page reached after
    redirects. For every anchor inside the expected container ``div``
    whose ``href`` ends with ``pdf``, an ``856`` field carrying the PDF
    URL and an ``FFT`` field carrying the path returned by
    ``download_file`` are added to ``rec`` via ``record_add_field``.

    :param rec: record passed through to ``record_add_field``.
    :param doi: DOI string appended to ``http://dx.doi.org/``.
    :return: ``None``. Returns early, adding nothing, when the page has
        no body or the container ``div`` is not found.
    """
    # Local import keeps this block self-contained.
    from os.path import isdir

    response = requests.get('http://dx.doi.org/' + doi)
    # Use the URL after redirects: the hrefs on the page are prefixed
    # with the scheme and host of the site that actually served it.
    parsed_uri = urlparse(response.url)
    domain = '{uri.scheme}://{uri.netloc}'.format(uri=parsed_uri)
    soup = BeautifulSoup(response.text)
    try:
        if 'epjconf' in doi:
            div = soup.body.find('div', attrs={'id': 'header'})
        else:
            div = soup.body.find(
                'div', attrs={'class': 'module_background files'})
        links = div.findAll('a')
    except AttributeError:
        # ``soup.body`` or the looked-up div is None.
        return

    out_folder = join(CFG_EDPSCIENCE_OUT_FOLDER, "fulltexts")
    for pdf in links:
        # Anchors without an href (e.g. named anchors) are skipped
        # instead of raising KeyError and aborting the whole record.
        href = pdf.get('href')
        if not href or not href.endswith('pdf'):
            continue
        link_to_pdf = domain + href
        record_add_field(rec,
                         '856',
                         ind1='4',
                         subfields=[('u', link_to_pdf),
                                    ('y', 'EDP Sciences server')])
        filename = join(out_folder, link_to_pdf.split('/')[-1])
        try:
            makedirs(out_folder)
        except (IOError, OSError):
            # makedirs also raises when the folder already exists; that
            # is not a failure. Only give up on the target path when the
            # folder really is not there.
            if not isdir(out_folder):
                filename = None
        filename = download_file(from_url=link_to_pdf,
                                 to_filename=filename,
                                 retry_count=5)
        record_add_field(rec,
                         'FFT',
                         subfields=[('a', filename),
                                    ('t', 'INSPIRE-PUBLIC'),
                                    ('d', 'Fulltext')])