def fetch_link(link):
    """Return our mirrored copy of `link` if one exists; otherwise fetch the
    origin file, mirror it to S3, and return it."""
    url = link.download_url

    # Most recent download of this link that was successfully mirrored, if any.
    last_mirrored = Download.objects.filter(
        link_class=link.link_class, parameter=link.parameter
    ).exclude(mirror_s3_key='').order_by('-downloaded_at').first()
    if last_mirrored:
        # existing download was found; fetch it
        return last_mirrored.fetch_from_s3()

    # No mirrored copy exists - fetch and mirror the origin file,
    # recording a failed attempt (with its error type) before re-raising.
    try:
        blob = fetch_origin_url(url)
    except (urllib.error.URLError, FileTooBig) as ex:
        Download.objects.create(
            downloaded_at=datetime.datetime.now(),
            link_class=link.link_class,
            parameter=link.parameter,
            error_type=ex.__class__.__name__,
        )
        raise

    download = Download(
        downloaded_at=datetime.datetime.now(),
        link_class=link.link_class,
        parameter=link.parameter,
        sha1=blob.sha1,
        md5=blob.md5,
        file_size=blob.file_size,
    )

    # If an identical file (same sha1) was mirrored before, reuse its S3 object
    # rather than uploading a duplicate.
    duplicate = Download.objects.filter(sha1=blob.sha1).first()
    if duplicate:
        download.mirror_s3_key = duplicate.mirror_s3_key
    else:
        key_name = '/'.join((
            blob.sha1[0:2],
            blob.sha1[2:4],
            blob.sha1[4:16],
            clean_filename(blob.filename),
        ))
        bucket = open_bucket()
        bucket.put_object(Key=key_name, Body=blob.file_content)
        download.mirror_s3_key = key_name
    download.save()

    if link.is_zip_file():
        # Catalogue the zipfile contents if we don't have them already.
        if not ArchiveMember.objects.filter(archive_sha1=blob.sha1).exists():
            z = blob.as_zipfile()
            for info in z.infolist():
                # Zip files carry no reliable declaration of the filename's
                # character encoding (there is a supposedly-utf-8 flag, but its
                # real-world use is patchy - see
                # https://gist.github.com/jnalley/cec21bca2d865758bc5e23654df28bd5),
                # so we treat the name as an arbitrary byte string and store it
                # in the database decoded as iso-8859-1, an encoding that maps
                # every byte value to a character (and leaves pure ASCII
                # untouched). Encoding the database value as iso-8859-1 again
                # recovers the original bytes.
                #
                # Python 3's zipfile always hands us a unicode str: decoded as
                # utf-8 when the flag is set, cp437 otherwise. Re-encoding as
                # cp437 is reversible and reproduces the byte string that the
                # old Python 2 code (which only saw bytes, or pure ASCII)
                # would have stored - so rows written by either version agree.
                #
                # zipfile to database:
                #   if the name is a unicode string (always on py3):
                #       encode as cp437 to get the byte string back
                #   decode the byte string as iso-8859-1 for storage
                # database to zipfile:
                #   encode as iso-8859-1, then (on py3) decode as cp437
                member_name = info.filename
                if isinstance(member_name, str):  # pragma: no cover
                    member_name = member_name.encode('cp437')
                member_name = member_name.decode('iso-8859-1')
                ArchiveMember.objects.get_or_create(
                    filename=member_name,
                    file_size=info.file_size,
                    archive_sha1=blob.sha1)

    return blob
def fetch_link(link):
    """Fetch our mirrored copy of the given link if available; if not,
    mirror and return the original file."""
    url = link.download_url

    # Look for the most recent download of this link that made it to S3.
    mirrored = Download.objects.filter(
        link_class=link.link_class,
        parameter=link.parameter,
    ).exclude(mirror_s3_key='').order_by('-downloaded_at').first()
    if mirrored:
        # existing download was found; fetch it
        return mirrored.fetch_from_s3()

    # No mirrored copy exists - fetch and mirror the origin file. A failed
    # fetch is recorded (with the exception's class name) before re-raising.
    try:
        blob = fetch_origin_url(url)
    except (urllib2.URLError, FileTooBig) as ex:
        Download.objects.create(
            downloaded_at=datetime.datetime.now(),
            link_class=link.link_class,
            parameter=link.parameter,
            error_type=ex.__class__.__name__,
        )
        raise

    download = Download(
        downloaded_at=datetime.datetime.now(),
        link_class=link.link_class,
        parameter=link.parameter,
        sha1=blob.sha1,
        md5=blob.md5,
        file_size=blob.file_size,
    )

    # An identical file (same sha1) may already be mirrored; if so, point at
    # the existing S3 object instead of uploading again.
    already_mirrored = Download.objects.filter(sha1=blob.sha1).first()
    if already_mirrored:
        download.mirror_s3_key = already_mirrored.mirror_s3_key
    else:
        key_name = '%s/%s/%s/%s' % (
            blob.sha1[0:2], blob.sha1[2:4], blob.sha1[4:16],
            clean_filename(blob.filename))
        bucket = open_bucket()
        k = Key(bucket)
        k.key = key_name
        k.set_contents_from_string(blob.file_content)
        download.mirror_s3_key = key_name
    download.save()

    # Catalogue the zipfile contents if we don't have them already.
    if link.is_zip_file() and not ArchiveMember.objects.filter(archive_sha1=blob.sha1).exists():
        z = blob.as_zipfile()
        for info in z.infolist():
            # Zip files do not declare the character encoding of their
            # filenames, so we decode as iso-8859-1 (which maps every byte
            # value to a character) to obtain *some* valid unicode string for
            # the database; re-encoding it as iso-8859-1 later recovers the
            # original byte sequence.
            ArchiveMember.objects.get_or_create(
                filename=info.filename.decode('iso-8859-1'),
                file_size=info.file_size,
                archive_sha1=blob.sha1)

    return blob