def find_zipped_screenshottable_graphics():
    # Return a list of ProductionLink objects that link to archive files
    # that we can plausibly expect to extract screenshots from, for productions
    # that don't have screenshots already.
    from django.db.models import Count

    # prods of supertype=graphics that have download links but no screenshots
    prods = Production.objects.annotate(screenshot_count=Count('screenshots')).filter(
        supertype='graphics', screenshot_count=0, links__is_download_link=True
    ).prefetch_related('links', 'platforms', 'types')

    prod_links = []
    for prod in prods:
        for link in prod.links.all():
            if not (link.is_download_link and link.is_zip_file()):
                continue

            # skip ASCII and executable graphics
            if prod.types.filter(internal_name__in=[
                    'ascii', 'ascii-collection', 'ansi', 'exe-graphics', '4k-exe-graphics']):
                continue

            # skip prods for a specific platform other than DOS/Windows
            if prod.platforms.exclude(name__in=['MS-Dos', 'Windows']):
                continue

            file_for_screenshot = None
            # see if we've already got a best candidate archive member to take the image from
            if link.file_for_screenshot:
                file_for_screenshot = link.file_for_screenshot
            else:
                # failing that, see if we already have a directory listing for this download
                # and can derive a candidate from that
                download = Download.last_mirrored_download_for_url(link.download_url)
                if download and download.archive_members.count():
                    file_for_screenshot = download.select_screenshot_file()
                    if file_for_screenshot:
                        # we've found a candidate (which probably means we've improved
                        # select_screenshot_file since it was last run on this archive) -
                        # might as well store it against the ProductionLink, so it doesn't
                        # show up as something to be manually resolved
                        link.file_for_screenshot = file_for_screenshot
                        link.is_unresolved_for_screenshotting = False
                        link.save()
                    else:
                        # we have a directory listing but no clear candidate, so give up on this link
                        link.is_unresolved_for_screenshotting = True
                        link.save()
                        continue

            if file_for_screenshot:
                # we know in advance which file we'd like to extract from the archive -
                # better make sure it's a format we can actually handle, then.
                extension = link.file_for_screenshot.split('.')[-1].lower()
                if extension not in USABLE_IMAGE_FILE_EXTENSIONS:
                    continue

            prod_links.append(link)
            break  # success, so ignore any remaining links for this prod

    return prod_links
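# The report above relies on Download.select_screenshot_file() to pick a candidate image
# from an archive listing; that method isn't shown in this section. The sketch below is a
# hypothetical stand-in illustrating the kind of heuristic involved (filter members to
# usable image extensions, prefer the largest file), not the actual implementation.
def select_screenshot_file_sketch(archive_members, usable_extensions):
    # keep only members whose extension suggests an image we can decode,
    # using the same extension test as the report above
    candidates = [
        member for member in archive_members
        if member.filename.split('.')[-1].lower() in usable_extensions
    ]
    if not candidates:
        return None
    # assume the largest image file is most likely to be the main artwork
    return max(candidates, key=lambda member: member.file_size).filename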
def unresolved_screenshots(request):
    links = ProductionLink.objects.filter(
        is_unresolved_for_screenshotting=True
    ).select_related('production')

    entries = []
    for link in links[:100]:
        download = Download.last_mirrored_download_for_url(link.download_url)
        if download:
            entries.append((link, download, download.archive_members.all()))

    return render(request, 'maintenance/unresolved_screenshots.html', {
        'title': 'Unresolved screenshots',
        'link_count': links.count(),
        'entries': entries,
        'report_name': 'unresolved_screenshots',
    })
def fetch_url(url):
    # Fetch our mirrored copy of the given URL if available;
    # if not, mirror and return the original file
    download = Download.last_mirrored_download_for_url(url)
    if download:
        # existing download was found; fetch it
        return download, download.fetch_from_s3()
    else:
        # no mirrored copy exists - fetch and mirror the origin file
        try:
            remote_filename, file_content = fetch_origin_url(url)
        except (urllib2.URLError, FileTooBig) as ex:
            Download.objects.create(
                url=url,
                downloaded_at=datetime.datetime.now(),
                error_type=ex.__class__.__name__)
            raise
        download = upload_to_mirror(url, remote_filename, file_content)
        return download, file_content
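# fetch_origin_url is not defined in this section. A minimal sketch of what it might look
# like, assuming FileTooBig (the codebase's exception, caught above) is raised when the
# response exceeds a size cap. The helper name, the 10 MB limit, and the filename
# derivation are illustrative assumptions, not the real code.
import urllib2

MAX_DOWNLOAD_SIZE = 10 * 1024 * 1024  # assumed cap; the real limit may differ

def fetch_origin_url_sketch(url):
    response = urllib2.urlopen(url, timeout=30)
    try:
        # derive a filename from the final URL path (ignoring Content-Disposition for brevity)
        remote_filename = response.geturl().split('/')[-1] or 'download'
        # read one byte past the cap so we can distinguish "at the limit" from "over it"
        file_content = response.read(MAX_DOWNLOAD_SIZE + 1)
    finally:
        response.close()
    if len(file_content) > MAX_DOWNLOAD_SIZE:
        raise FileTooBig('%s exceeds %d bytes' % (url, MAX_DOWNLOAD_SIZE))
    return remote_filename, file_content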
def fetch_link(link):
    # Fetch our mirrored copy of the given link if available;
    # if not, mirror and return the original file
    url = link.download_url

    # find last mirrored download
    download = Download.objects.filter(
        link_class=link.link_class, parameter=link.parameter
    ).exclude(mirror_s3_key='').order_by('-downloaded_at').first()

    if download:
        # existing download was found; fetch it
        return download.fetch_from_s3()
    else:
        # no mirrored copy exists - fetch and mirror the origin file
        try:
            blob = fetch_origin_url(url)
        except (urllib.error.URLError, FileTooBig) as ex:
            Download.objects.create(
                downloaded_at=datetime.datetime.now(),
                link_class=link.link_class,
                parameter=link.parameter,
                error_type=ex.__class__.__name__)
            raise

        download = Download(
            downloaded_at=datetime.datetime.now(),
            link_class=link.link_class,
            parameter=link.parameter,
            sha1=blob.sha1,
            md5=blob.md5,
            file_size=blob.file_size,
        )

        # is there already a mirrored download with this sha1?
        existing_download = Download.objects.filter(sha1=blob.sha1).first()
        if existing_download:
            download.mirror_s3_key = existing_download.mirror_s3_key
        else:
            key_name = (
                blob.sha1[0:2] + '/' + blob.sha1[2:4] + '/' + blob.sha1[4:16] + '/'
                + clean_filename(blob.filename)
            )
            bucket = open_bucket()
            bucket.put_object(Key=key_name, Body=blob.file_content)
            download.mirror_s3_key = key_name

        download.save()

        if link.is_zip_file():
            # catalogue the zipfile contents if we don't have them already
            if not ArchiveMember.objects.filter(archive_sha1=blob.sha1).exists():
                z = blob.as_zipfile()
                for info in z.infolist():
                    # The Incredible Disaster of Platform Specific Implementations of Zip:
                    # https://gist.github.com/jnalley/cec21bca2d865758bc5e23654df28bd5
                    #
                    # Historically, zip files did not specify what character encoding the
                    # filename is using; there is supposedly a flag to indicate 'yo this is
                    # utf-8', but it's unclear how widely used/recognised it is, and you can
                    # bet that scene.org has some weird shit on it. So, we consider the
                    # filename to be an arbitrary byte string.
                    #
                    # Since the database wants to store unicode strings, we decode the byte
                    # string as iso-8859-1 to obtain one, and encode it as iso-8859-1 again
                    # on the way out of the database. iso-8859-1 is chosen because it gives
                    # a well-defined result for any arbitrary byte string, and doesn't
                    # unnecessarily mangle pure ASCII filenames.
                    #
                    # So, how do we get a byte string from the result of ZipFile.infolist?
                    # Python 2 gives us a unicode string if the mythical utf-8 flag is set,
                    # and a byte string otherwise. Our old python-2-only code called
                    # filename.decode('iso-8859-1'), which would have failed on a unicode
                    # string containing non-ascii characters, so we can assume that anything
                    # that made it as far as the database originated either as pure ascii or
                    # a bytestring. Either way, calling database_value.encode('iso-8859-1')
                    # would give a bytestring that python 2's zipfile library can accept
                    # (i.e. it compares equal to the filename it originally gave us).
                    #
                    # Python 3 ALWAYS gives us a unicode string: decoded as utf-8 if the
                    # mythical flag is set, or decoded as cp437 if not. We don't need to
                    # know which of these outcomes happened; we just need to ensure that
                    # 1) the transformation from unicode string to byte string is
                    #    reversible, and
                    # 2) the byte string representation matches the one that python 2 would
                    #    have given us for the same filename.
                    #
                    # The latter condition is satisfied by filename.encode('cp437'), which
                    # makes the reverse transformation bytestring.decode('cp437').
                    # Therefore our final algorithm is:
                    #
                    # zipfile to database:
                    #     if filename is a unicode string (i.e. we are on py3 or the mythical flag is set):
                    #         filename = filename.encode('cp437')
                    #     # filename is now a bytestring
                    #     return filename.decode('iso-8859-1')
                    #
                    # database to zipfile:
                    #     bytestring = database_value.encode('iso-8859-1')
                    #     if we are on py2:
                    #         return bytestring
                    #     else:
                    #         return bytestring.decode('cp437')

                    filename = info.filename
                    if isinstance(filename, str):  # pragma: no cover
                        filename = filename.encode('cp437')
                    filename = filename.decode('iso-8859-1')
                    ArchiveMember.objects.get_or_create(
                        filename=filename,
                        file_size=info.file_size,
                        archive_sha1=blob.sha1)

        return blob
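# A small self-contained demonstration of the round trip described above, using Python 3's
# zipfile module. It is illustrative only: it mirrors the zipfile-to-database and
# database-to-zipfile transformations from the comments, outside of any Django model, and
# the helper names are ours, not the codebase's.
import io
import zipfile

def zip_filename_to_db(filename):
    # py3: infolist() always yields str (utf-8- or cp437-decoded); recover the raw bytes,
    # then decode as iso-8859-1 for storage
    return filename.encode('cp437').decode('iso-8859-1')

def db_to_zip_filename(database_value):
    return database_value.encode('iso-8859-1').decode('cp437')

buf = io.BytesIO()
with zipfile.ZipFile(buf, 'w') as zf:
    zf.writestr('résumé.txt', b'')  # non-ascii name; py3 writes it as utf-8 with the flag set
with zipfile.ZipFile(buf) as zf:
    original = zf.infolist()[0].filename
    stored = zip_filename_to_db(original)
    assert db_to_zip_filename(stored) == original  # round trip is lossless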
# Earlier Python 2 variant of fetch_link (urllib2 and the legacy boto Key API) - the
# "old python-2-only code" referred to in the encoding notes above.
def fetch_link(link):
    # Fetch our mirrored copy of the given link if available;
    # if not, mirror and return the original file
    url = link.download_url

    # find last mirrored download
    download = Download.objects.filter(
        link_class=link.link_class, parameter=link.parameter
    ).exclude(mirror_s3_key='').order_by('-downloaded_at').first()

    if download:
        # existing download was found; fetch it
        return download.fetch_from_s3()
    else:
        # no mirrored copy exists - fetch and mirror the origin file
        try:
            blob = fetch_origin_url(url)
        except (urllib2.URLError, FileTooBig) as ex:
            Download.objects.create(
                downloaded_at=datetime.datetime.now(),
                link_class=link.link_class,
                parameter=link.parameter,
                error_type=ex.__class__.__name__)
            raise

        download = Download(
            downloaded_at=datetime.datetime.now(),
            link_class=link.link_class,
            parameter=link.parameter,
            sha1=blob.sha1,
            md5=blob.md5,
            file_size=blob.file_size,
        )

        # is there already a mirrored link with this sha1?
        existing_download = Download.objects.filter(sha1=blob.sha1).first()
        if existing_download:
            download.mirror_s3_key = existing_download.mirror_s3_key
        else:
            key_name = (
                blob.sha1[0:2] + '/' + blob.sha1[2:4] + '/' + blob.sha1[4:16] + '/'
                + clean_filename(blob.filename)
            )
            bucket = open_bucket()
            k = Key(bucket)
            k.key = key_name
            k.set_contents_from_string(blob.file_content)
            download.mirror_s3_key = key_name

        download.save()

        if link.is_zip_file():
            # catalogue the zipfile contents if we don't have them already
            if not ArchiveMember.objects.filter(archive_sha1=blob.sha1).exists():
                z = blob.as_zipfile()
                for info in z.infolist():
                    # zip files do not contain information about the character encoding of
                    # filenames. We therefore decode the filename as iso-8859-1 (an encoding
                    # which defines a character for every byte value) to ensure that it is
                    # *some* valid sequence of unicode characters that can be inserted into
                    # the database. When we need to access this zipfile entry again, we will
                    # re-encode it as iso-8859-1 to get back the original byte sequence.
                    ArchiveMember.objects.get_or_create(
                        filename=info.filename.decode('iso-8859-1'),
                        file_size=info.file_size,
                        archive_sha1=blob.sha1)

        return blob
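# Quick sanity check of the claim above: iso-8859-1 assigns a character to every byte
# value, so decode/encode round-trips any byte string unchanged. (Written for Python 3;
# the same property held for py2's str/unicode types.)
every_byte = bytes(range(256))
assert every_byte.decode('iso-8859-1').encode('iso-8859-1') == every_byte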