示例#1
0
def download_one(recid, version):
    """Fetch one version of a record's arXiv PDF and attach it if it changed.

    Only the first arXiv id found for ``recid`` is processed; every further
    id merely produces a warning message.

    Raises PdfNotAvailable when a download smaller than 25000 bytes contains
    the 'PDF unavailable' marker, and FoundExistingPdf when the download has
    the same md5 as the latest stored "arXiv" file of the record.
    """
    write_message('fetching %s' % recid)
    for position, arxiv_id in enumerate(extract_arxiv_ids_from_recid(recid)):
        if position > 0:
            write_message("Warning: %s has multiple arxiv #" % recid)
            continue

        pdf_url = build_arxiv_url(arxiv_id, version)
        safe_id = arxiv_id.replace('/', '_')
        # The temporary file object stays bound to a local name for the rest
        # of the function; only its name is handed to the downloader.
        scratch = NamedTemporaryFile(prefix="arxiv-pdf-checker",
                                     dir=CFG_TMPSHAREDDIR,
                                     suffix="%s.pdf" % safe_id)
        write_message('downloading pdf from %s' % pdf_url)
        pdf_path = download_external_url(pdf_url,
                                         scratch.name,
                                         content_type='pdf')

        # A small download may be an html "not found" page instead of a pdf
        if os.path.getsize(pdf_path) < 25000:
            with open(pdf_path) as handle:
                for text_line in handle:
                    if 'PDF unavailable' in text_line:
                        raise PdfNotAvailable()

        docs = BibRecDocs(recid)
        bibdocfiles = docs.list_latest_files(doctype="arXiv")

        try:
            latest = bibdocfiles[0]
        except IndexError:
            # Nothing stored yet: the download always has to be attached
            latest = None
            changed = True
        else:
            stored_digest = calculate_md5(latest.fullpath)
            fresh_digest = calculate_md5(pdf_path.encode('utf-8'))
            changed = fresh_digest != stored_digest
            if changed:
                write_message('md5 differs updating')
            else:
                write_message('md5 matches existing pdf, skipping')

        if not changed:
            raise FoundExistingPdf()

        if bibdocfiles:
            write_message('adding as new version')
            docs.add_new_version(pdf_path, docname=latest.name)
        else:
            write_message('adding as new file')
            docs.add_new_file(pdf_path,
                              doctype="arXiv",
                              docname="arXiv:%s" % safe_id)
示例#2
0
文件: arxiv.py 项目: Theer108/invenio
def download_one(recid, version):
    """Download the requested version of the arXiv PDF for ``recid``.

    The first arXiv id of the record is used; any additional id only logs a
    warning. The downloaded file is attached as a new file when the record
    has no "arXiv" document yet, or as a new version when its md5 differs
    from the latest stored one.

    Raises PdfNotAvailable if a download below 25000 bytes contains the
    "PDF unavailable" marker, and FoundExistingPdf if the md5 is unchanged.
    """
    write_message("fetching %s" % recid)
    for index, arxiv_id in enumerate(extract_arxiv_ids_from_recid(recid)):
        if index:
            write_message("Warning: %s has multiple arxiv #" % recid)
            continue

        source_url = build_arxiv_url(arxiv_id, version)
        flat_arxiv_id = arxiv_id.replace("/", "_")
        # Only the name of the temporary file is passed on; the object itself
        # remains referenced by this local until the function exits.
        tmp_pdf = NamedTemporaryFile(
            prefix="arxiv-pdf-checker",
            dir=CFG_TMPSHAREDDIR,
            suffix="%s.pdf" % flat_arxiv_id,
        )
        write_message("downloading pdf from %s" % source_url)
        local_path = download_external_url(
            source_url, tmp_pdf.name, content_type="pdf"
        )

        # Small files are scanned for the html "not found" page marker
        if os.path.getsize(local_path) < 25000:
            with open(local_path) as stream:
                if any("PDF unavailable" in row for row in stream):
                    raise PdfNotAvailable()

        docs = BibRecDocs(recid)
        bibdocfiles = docs.list_latest_files(doctype="arXiv")

        try:
            stored_file = bibdocfiles[0]
        except IndexError:
            # No previous arXiv document, so there is nothing to compare
            stored_file = None
            must_attach = True
        else:
            old_digest = calculate_md5(stored_file.fullpath)
            new_digest = calculate_md5(local_path.encode("utf-8"))
            must_attach = new_digest != old_digest
            if must_attach:
                write_message("md5 differs updating")
            else:
                write_message("md5 matches existing pdf, skipping")

        if not must_attach:
            raise FoundExistingPdf()

        if bibdocfiles:
            write_message("adding as new version")
            docs.add_new_version(local_path, docname=stored_file.name)
        else:
            write_message("adding as new file")
            docs.add_new_file(
                local_path,
                doctype="arXiv",
                docname="arXiv:%s" % flat_arxiv_id,
            )
示例#3
0
 def test_md5_algorithms(self):
     """bibdocfile - compare md5 algorithms

     Writes a small text file under ``self.path`` and checks that
     ``calculate_md5`` (forced to its internal implementation) and
     ``calculate_md5_external`` return the same value for it.
     """
     from invenio.legacy.bibdocfile.api import calculate_md5, \
         calculate_md5_external
     filepath = os.path.join(self.path, 'test.txt')
     # Close the file explicitly so the content is flushed to disk before
     # either md5 implementation reads it.
     with open(filepath, "w") as fixture:
         fixture.write("test")
     self.assertEqual(calculate_md5(filepath, force_internal=True),
                      calculate_md5_external(filepath))
示例#4
0
 def test_md5_algorithms(self):
     """bibdocfile - compare md5 algorithms

     Creates ``test.txt`` under ``self.path`` and asserts that the internal
     ``calculate_md5`` path and ``calculate_md5_external`` agree on it.
     """
     from invenio.legacy.bibdocfile.api import calculate_md5, \
         calculate_md5_external
     filepath = os.path.join(self.path, 'test.txt')
     # The with-block guarantees the data is written and the handle closed
     # before the file is hashed.
     with open(filepath, "w") as sample:
         sample.write("test")
     self.assertEqual(calculate_md5(filepath, force_internal=True),
                      calculate_md5_external(filepath))
示例#5
0
 def test_posting_file(self):
     """webstyle - direct posting of a file

     POSTs ``test.gif`` to ``/httptest/post2`` on the host taken from
     CFG_SITE_URL, sending its md5 in the Content-MD5 header, and asserts
     that the response body equals the bytes that were sent.
     """
     from invenio.legacy.bibdocfile.api import calculate_md5
     path = os.path.join(CFG_PREFIX, 'lib', 'webtest', 'invenio', 'test.gif')
     # Read the image in binary mode so the posted bytes are exactly the
     # file content the md5 is computed from, and close the handle promptly.
     with open(path, 'rb') as gif_file:
         body = gif_file.read()
     md5 = calculate_md5(path)
     mimetype = 'image/gif'
     connection = httplib.HTTPConnection(urlparse.urlsplit(CFG_SITE_URL)[1])
     try:
         connection.request('POST', '/httptest/post2', body, {'Content-MD5': md5, 'Content-Type': mimetype, 'Content-Disposition': 'filename=test.gif'})
         response = connection.getresponse()
         body2 = response.read()
     finally:
         # Release the socket even when the request or the read fails
         connection.close()
     self.assertEqual(body, body2, "Body sent differs from body received")
示例#6
0
 def test_posting_file(self):
     """webstyle - direct posting of a file

     POSTs ``test.gif`` to ``/httptest/post2`` on the host taken from
     ``cfg['CFG_SITE_URL']``, sending its md5 in the Content-MD5 header, and
     asserts that the response body equals the bytes that were sent.
     """
     from invenio.legacy.bibdocfile.api import calculate_md5
     path = os.path.join(cfg['CFG_PREFIX'], 'lib', 'webtest', 'invenio',
                         'test.gif')
     # Binary mode keeps the posted bytes identical to the file content the
     # md5 is computed from; the with-block closes the handle.
     with open(path, 'rb') as gif_file:
         body = gif_file.read()
     md5 = calculate_md5(path)
     mimetype = 'image/gif'
     connection = httplib.HTTPConnection(
         urlparse.urlsplit(cfg['CFG_SITE_URL'])[1])
     try:
         connection.request(
             'POST', '/httptest/post2', body, {
                 'Content-MD5': md5,
                 'Content-Type': mimetype,
                 'Content-Disposition': 'filename=test.gif'
             })
         response = connection.getresponse()
         body2 = response.read()
     finally:
         # Release the socket even when the request or the read fails
         connection.close()
     self.assertEqual(body, body2,
                      "Body sent differs from body received")