def download_one(recid, version):
    """Download given version of the PDF from arxiv

    Only the first arXiv id yielded by ``extract_arxiv_ids_from_recid`` is
    processed; each additional id only logs a warning.  The PDF is downloaded
    to a temporary file in ``CFG_TMPSHAREDDIR`` and its md5 is compared with
    the first file returned by ``list_latest_files(doctype="arXiv")`` for the
    record.  The download is attached as a new version when the checksums
    differ, or as a new file when the record has no such file yet.

    :param recid: id of the record whose arXiv PDF is fetched
    :param version: PDF version, passed through to ``build_arxiv_url``
    :raises PdfNotAvailable: when a download smaller than 25000 bytes
        contains the text "PDF unavailable"
    :raises FoundExistingPdf: when the download has the same md5 as the
        file already attached to the record
    """
    write_message('fetching %s' % recid)
    for count, arxiv_id in enumerate(extract_arxiv_ids_from_recid(recid)):
        if count != 0:
            write_message("Warning: %s has multiple arxiv #" % recid)
            continue

        url_for_pdf = build_arxiv_url(arxiv_id, version)
        # '/' is replaced so the id can be used in file and document names
        filename_arxiv_id = arxiv_id.replace('/', '_')
        # NOTE(review): temp_file is kept referenced on purpose; by default
        # NamedTemporaryFile removes the file once the object is closed.
        temp_file = NamedTemporaryFile(prefix="arxiv-pdf-checker",
                                       dir=CFG_TMPSHAREDDIR,
                                       suffix="%s.pdf" % filename_arxiv_id)
        write_message('downloading pdf from %s' % url_for_pdf)
        path = download_external_url(url_for_pdf,
                                     temp_file.name,
                                     content_type='pdf')

        # Check if it is not an html not found page
        filesize = os.path.getsize(path)
        if filesize < 25000:
            # Binary mode: the download may be a genuine (binary) PDF, which
            # must not be decoded as text just to look for the marker.
            with open(path, 'rb') as downloaded:
                for line in downloaded:
                    if b'PDF unavailable' in line:
                        raise PdfNotAvailable()

        docs = BibRecDocs(recid)
        bibdocfiles = docs.list_latest_files(doctype="arXiv")

        needs_update = False
        try:
            bibdocfile = bibdocfiles[0]
        except IndexError:
            # No arXiv file attached yet: the download is always added
            bibdocfile = None
            needs_update = True
        else:
            existing_md5 = calculate_md5(bibdocfile.fullpath)
            new_md5 = calculate_md5(path.encode('utf-8'))
            if new_md5 != existing_md5:
                write_message('md5 differs updating')
                needs_update = True
            else:
                write_message('md5 matches existing pdf, skipping')

        if needs_update:
            if bibdocfiles:
                write_message('adding as new version')
                docs.add_new_version(path, docname=bibdocfile.name)
            else:
                write_message('adding as new file')
                docs.add_new_file(path,
                                  doctype="arXiv",
                                  docname="arXiv:%s" % filename_arxiv_id)
        else:
            raise FoundExistingPdf()
def download_one(recid, version):
    """Download given version of the PDF from arxiv

    Only the first arXiv id yielded by ``extract_arxiv_ids_from_recid`` is
    processed; each additional id only logs a warning.  The PDF is downloaded
    to a temporary file in ``CFG_TMPSHAREDDIR`` and its md5 is compared with
    the first file returned by ``list_latest_files(doctype="arXiv")`` for the
    record.  The download is attached as a new version when the checksums
    differ, or as a new file when the record has no such file yet.

    :param recid: id of the record whose arXiv PDF is fetched
    :param version: PDF version, passed through to ``build_arxiv_url``
    :raises PdfNotAvailable: when a download smaller than 25000 bytes
        contains the text "PDF unavailable"
    :raises FoundExistingPdf: when the download has the same md5 as the
        file already attached to the record
    """
    write_message("fetching %s" % recid)
    for count, arxiv_id in enumerate(extract_arxiv_ids_from_recid(recid)):
        if count != 0:
            write_message("Warning: %s has multiple arxiv #" % recid)
            continue

        url_for_pdf = build_arxiv_url(arxiv_id, version)
        # "/" is replaced so the id can be used in file and document names
        filename_arxiv_id = arxiv_id.replace("/", "_")
        # NOTE(review): temp_file is kept referenced on purpose; by default
        # NamedTemporaryFile removes the file once the object is closed.
        temp_file = NamedTemporaryFile(
            prefix="arxiv-pdf-checker",
            dir=CFG_TMPSHAREDDIR,
            suffix="%s.pdf" % filename_arxiv_id,
        )
        write_message("downloading pdf from %s" % url_for_pdf)
        path = download_external_url(url_for_pdf, temp_file.name, content_type="pdf")

        # Check if it is not an html not found page
        filesize = os.path.getsize(path)
        if filesize < 25000:
            # Binary mode: the download may be a genuine (binary) PDF, which
            # must not be decoded as text just to look for the marker.
            with open(path, "rb") as downloaded:
                for line in downloaded:
                    if b"PDF unavailable" in line:
                        raise PdfNotAvailable()

        docs = BibRecDocs(recid)
        bibdocfiles = docs.list_latest_files(doctype="arXiv")

        needs_update = False
        try:
            bibdocfile = bibdocfiles[0]
        except IndexError:
            # No arXiv file attached yet: the download is always added
            bibdocfile = None
            needs_update = True
        else:
            existing_md5 = calculate_md5(bibdocfile.fullpath)
            new_md5 = calculate_md5(path.encode("utf-8"))
            if new_md5 != existing_md5:
                write_message("md5 differs updating")
                needs_update = True
            else:
                write_message("md5 matches existing pdf, skipping")

        if needs_update:
            if bibdocfiles:
                write_message("adding as new version")
                docs.add_new_version(path, docname=bibdocfile.name)
            else:
                write_message("adding as new file")
                docs.add_new_file(path, doctype="arXiv", docname="arXiv:%s" % filename_arxiv_id)
        else:
            raise FoundExistingPdf()
def test_md5_algorithms(self):
    """bibdocfile - compare md5 algorithms

    Writes a small file under ``self.path`` and checks that
    ``calculate_md5`` (with ``force_internal=True``) and
    ``calculate_md5_external`` return the same value for it.
    """
    from invenio.legacy.bibdocfile.api import (
        calculate_md5,
        calculate_md5_external,
    )
    filepath = os.path.join(self.path, 'test.txt')
    # Close the file explicitly so the content is flushed to disk before
    # either implementation reads it.
    with open(filepath, "w") as test_file:
        test_file.write("test")
    self.assertEqual(calculate_md5(filepath, force_internal=True),
                     calculate_md5_external(filepath))
def test_posting_file(self):
    """webstyle - direct posting of a file

    POSTs ``test.gif`` to ``/httptest/post2`` on the host taken from
    ``CFG_SITE_URL`` and asserts that the response body equals the bytes
    that were sent.
    """
    from invenio.legacy.bibdocfile.api import calculate_md5
    path = os.path.join(CFG_PREFIX, 'lib', 'webtest', 'invenio', 'test.gif')
    # The GIF is binary data: read it in binary mode and close the handle.
    with open(path, 'rb') as gif_file:
        body = gif_file.read()
    md5 = calculate_md5(path)
    mimetype = 'image/gif'
    # urlsplit(...)[1] is the network location (host[:port]) of the site URL
    connection = httplib.HTTPConnection(urlparse.urlsplit(CFG_SITE_URL)[1])
    try:
        connection.request('POST', '/httptest/post2', body,
                           {'Content-MD5': md5,
                            'Content-Type': mimetype,
                            'Content-Disposition': 'filename=test.gif'})
        response = connection.getresponse()
        body2 = response.read()
    finally:
        # Release the socket even when the request or the read fails
        connection.close()
    self.assertEqual(body, body2, "Body sent differs from body received")
def test_posting_file(self):
    """webstyle - direct posting of a file

    POSTs ``test.gif`` to ``/httptest/post2`` on the host taken from
    ``cfg['CFG_SITE_URL']`` and asserts that the response body equals the
    bytes that were sent.
    """
    from invenio.legacy.bibdocfile.api import calculate_md5
    path = os.path.join(cfg['CFG_PREFIX'], 'lib', 'webtest',
                        'invenio', 'test.gif')
    # The GIF is binary data: read it in binary mode and close the handle.
    with open(path, 'rb') as gif_file:
        body = gif_file.read()
    md5 = calculate_md5(path)
    mimetype = 'image/gif'
    # urlsplit(...)[1] is the network location (host[:port]) of the site URL
    connection = httplib.HTTPConnection(
        urlparse.urlsplit(cfg['CFG_SITE_URL'])[1])
    try:
        connection.request(
            'POST', '/httptest/post2', body, {
                'Content-MD5': md5,
                'Content-Type': mimetype,
                'Content-Disposition': 'filename=test.gif'
            })
        response = connection.getresponse()
        body2 = response.read()
    finally:
        # Release the socket even when the request or the read fails
        connection.close()
    self.assertEqual(body, body2, "Body sent differs from body received")