예제 #1
0
파일: actions.py 프로젝트: asbjornu/demozoo
def find_zipped_screenshottable_graphics():
	"""Return a list of ProductionLink objects that link to zip archives
	from which we can plausibly extract screenshots, for graphics
	productions that don't have screenshots already.

	Skips ASCII/ANSI and executable graphics, and productions tied to a
	specific platform other than DOS/Windows.  As a side effect, stores any
	newly-derived screenshot candidate (or the fact that none could be
	found) on the ProductionLink rows examined.
	"""
	# prods of supertype=graphics that have download links but no screenshots
	from django.db.models import Count
	prods = Production.objects.annotate(screenshot_count=Count('screenshots')).filter(
		supertype='graphics', screenshot_count=0, links__is_download_link=True).prefetch_related('links', 'platforms', 'types')

	prod_links = []
	for prod in prods:
		for link in prod.links.all():

			if not (link.is_download_link and link.is_zip_file()):
				continue

			# skip ASCII and executable graphics
			if prod.types.filter(internal_name__in=['ascii', 'ascii-collection', 'ansi', 'exe-graphics', '4k-exe-graphics']):
				continue

			# skip prods for a specific platform other than DOS/Windows
			if prod.platforms.exclude(name__in=['MS-Dos', 'Windows']):
				continue

			file_for_screenshot = None
			# see if we've already got a best candidate archive member to take the image from
			if link.file_for_screenshot:
				file_for_screenshot = link.file_for_screenshot
			else:
				# failing that, see if we already have a directory listing for this download
				# and can derive a candidate from that
				download = Download.last_mirrored_download_for_url(link.download_url)
				if download and download.archive_members.count():
					file_for_screenshot = download.select_screenshot_file()
					if file_for_screenshot:
						# we've found a candidate (which probably means we've improved select_screenshot_file
						# since it was last run on this archive) - might as well store it against the
						# ProductionLink, so it doesn't show up as something to be manually resolved
						link.file_for_screenshot = file_for_screenshot
						link.is_unresolved_for_screenshotting = False
						link.save()
					else:
						# we have a directory listing but no clear candidate, so give up on this link
						link.is_unresolved_for_screenshotting = True
						link.save()
						continue

			if file_for_screenshot:
				# we know in advance which file we'd like to extract from the archive -
				# better make sure it's a format we can actually handle, then.
				# (read the local variable for consistency; whenever we reach this
				# point with a truthy value it is equal to link.file_for_screenshot)
				extension = file_for_screenshot.split('.')[-1].lower()
				if extension not in USABLE_IMAGE_FILE_EXTENSIONS:
					continue

			prod_links.append(link)
			break  # success, so ignore any remaining links for this prod

	return prod_links
예제 #2
0
def find_zipped_screenshottable_graphics():
	"""Collect ProductionLink objects pointing at zip archives that we can
	plausibly extract screenshots from, for graphics productions that have
	no screenshots yet.  Updates each examined link's screenshot-candidate
	bookkeeping as a side effect.
	"""
	from django.db.models import Count

	# graphics prods that have at least one download link but no screenshots
	candidates = Production.objects.annotate(
		screenshot_count=Count('screenshots')
	).filter(
		supertype='graphics', screenshot_count=0, links__is_download_link=True
	).prefetch_related('links', 'platforms', 'types')

	skipped_types = ['ascii', 'ascii-collection', 'ansi', 'exe-graphics', '4k-exe-graphics']
	allowed_platforms = ['MS-Dos', 'Windows']

	results = []
	for production in candidates:
		for link in production.links.all():

			if not link.is_download_link or not link.is_zip_file():
				continue

			# ASCII and executable graphics don't contain extractable images
			if production.types.filter(internal_name__in=skipped_types):
				continue

			# only platform-neutral or DOS/Windows prods are in scope
			if production.platforms.exclude(name__in=allowed_platforms):
				continue

			member = None
			# prefer an already-recorded best candidate archive member
			if link.file_for_screenshot:
				member = link.file_for_screenshot
			else:
				# otherwise, try to derive one from an existing directory listing
				download = Download.last_mirrored_download_for_url(link.download_url)
				if download and download.archive_members.count():
					member = download.select_screenshot_file()
					if member:
						# a candidate turned up (select_screenshot_file has likely
						# improved since this archive was last scanned) - persist it
						# so the link no longer needs manual resolution
						link.file_for_screenshot = member
						link.is_unresolved_for_screenshotting = False
						link.save()
					else:
						# listing exists but yields no clear pick; give up on this link
						link.is_unresolved_for_screenshotting = True
						link.save()
						continue

			if member:
				# a specific archive member is chosen up front - confirm that its
				# extension is one we can actually render
				suffix = link.file_for_screenshot.split('.')[-1].lower()
				if suffix not in USABLE_IMAGE_FILE_EXTENSIONS:
					continue

			results.append(link)
			break  # one usable link per production is enough

	return results
예제 #3
0
파일: views.py 프로젝트: asbjornu/demozoo
def unresolved_screenshots(request):
	"""Maintenance report listing download links that were flagged as
	unresolvable for screenshotting, with their mirrored archive listings."""
	links = ProductionLink.objects.filter(
		is_unresolved_for_screenshotting=True
	).select_related('production')

	entries = []
	for link in links[:100]:  # cap the report at the first 100 links
		mirrored = Download.last_mirrored_download_for_url(link.download_url)
		if not mirrored:
			continue
		entries.append((link, mirrored, mirrored.archive_members.all()))

	return render(request, 'maintenance/unresolved_screenshots.html', {
		'title': 'Unresolved screenshots',
		'link_count': links.count(),
		'entries': entries,
		'report_name': 'unresolved_screenshots',
	})
예제 #4
0
파일: views.py 프로젝트: nswaldman/demozoo
def unresolved_screenshots(request):
    """Render the maintenance report of links awaiting manual screenshot
    resolution, pairing each with its last mirrored download (if any)."""
    links = ProductionLink.objects.filter(
        is_unresolved_for_screenshotting=True).select_related('production')

    # look up the mirrored download for (at most) the first 100 links
    mirrored_pairs = (
        (link, Download.last_mirrored_download_for_url(link.download_url))
        for link in links[:100]
    )
    entries = [
        (link, download, download.archive_members.all())
        for link, download in mirrored_pairs
        if download
    ]

    return render(
        request, 'maintenance/unresolved_screenshots.html', {
            'title': 'Unresolved screenshots',
            'link_count': links.count(),
            'entries': entries,
            'report_name': 'unresolved_screenshots',
        })
예제 #5
0
def fetch_url(url):
    """Return a ``(download, file_content)`` pair for *url*.

    Serves our mirrored copy when one exists; otherwise fetches the origin
    file and mirrors it.  A failed origin fetch is recorded as an errored
    Download row before the exception propagates.
    """
    mirrored = Download.last_mirrored_download_for_url(url)
    if mirrored:
        # an existing mirrored download was found - fetch it from S3
        return mirrored, mirrored.fetch_from_s3()

    # no mirrored copy exists: fetch and mirror the origin file
    try:
        remote_filename, file_content = fetch_origin_url(url)
    except (urllib2.URLError, FileTooBig) as ex:
        # record the failed attempt against this URL, then re-raise
        Download.objects.create(
            url=url,
            downloaded_at=datetime.datetime.now(),
            error_type=ex.__class__.__name__,
        )
        raise
    return upload_to_mirror(url, remote_filename, file_content), file_content
예제 #6
0
파일: actions.py 프로젝트: asbjornu/demozoo
def fetch_url(url):
	"""Fetch the mirrored copy of *url* if one is available; otherwise
	download the origin file, mirror it, and return it.

	Returns a ``(download, file_content)`` pair.  Origin-fetch failures are
	logged as errored Download rows and then re-raised.
	"""
	existing = Download.last_mirrored_download_for_url(url)
	if existing:
		# mirrored copy found - serve it straight from S3
		return existing, existing.fetch_from_s3()

	# nothing mirrored yet; go to the origin
	try:
		remote_filename, file_content = fetch_origin_url(url)
	except (urllib2.URLError, FileTooBig) as ex:
		# note the failed fetch in the Download log before re-raising
		Download.objects.create(
			url=url,
			downloaded_at=datetime.datetime.now(),
			error_type=ex.__class__.__name__
		)
		raise
	download = upload_to_mirror(url, remote_filename, file_content)
	return download, file_content
예제 #7
0
def _record_failed_fetch(link, ex):
    # Log a failed origin fetch for *link* as an errored Download row.
    Download.objects.create(
        downloaded_at=datetime.datetime.now(),
        link_class=link.link_class,
        parameter=link.parameter,
        error_type=ex.__class__.__name__)


def _mirror_blob(link, blob):
    """Create and save a Download row for *blob*, uploading it to S3 unless
    an identical file (same sha1) has been mirrored already."""
    download = Download(
        downloaded_at=datetime.datetime.now(),
        link_class=link.link_class,
        parameter=link.parameter,
        sha1=blob.sha1,
        md5=blob.md5,
        file_size=blob.file_size,
    )

    # is there already a mirrored link with this sha1?
    existing_download = Download.objects.filter(sha1=blob.sha1).first()
    if existing_download:
        # identical file already on the mirror - reuse its S3 key
        download.mirror_s3_key = existing_download.mirror_s3_key
    else:
        # key layout: 2/2/12 chars of the sha1 as path segments, then filename
        key_name = (blob.sha1[0:2] + '/' + blob.sha1[2:4] + '/' +
                    blob.sha1[4:16] + '/' + clean_filename(blob.filename))
        bucket = open_bucket()
        bucket.put_object(Key=key_name, Body=blob.file_content)
        download.mirror_s3_key = key_name

    download.save()
    return download


def _catalogue_zip_members(blob):
    """Create an ArchiveMember row per entry of *blob*'s zipfile, unless the
    archive (identified by sha1) has been catalogued already."""
    if ArchiveMember.objects.filter(archive_sha1=blob.sha1).exists():
        return

    z = blob.as_zipfile()
    for info in z.infolist():
        # Zip files don't reliably declare a filename encoding, so we treat
        # each name as an arbitrary byte string and store its iso-8859-1
        # decoding (defined for every byte value, leaves pure ASCII alone);
        # readers re-encode as iso-8859-1 to recover the original bytes.
        #
        # Python 3's zipfile always yields a unicode str (decoded as utf-8
        # when the archive's utf-8 flag is set, cp437 otherwise); encoding
        # it back as cp437 reproduces the byte string that python 2's
        # zipfile would have produced for the same entry, keeping new rows
        # consistent with values already stored by the old python-2 code.
        # Background: The Incredible Disaster of Platform Specific
        # Implementations of Zip -
        # https://gist.github.com/jnalley/cec21bca2d865758bc5e23654df28bd5
        filename = info.filename
        if isinstance(filename, str):  # pragma: no cover
            filename = filename.encode('cp437')
        filename = filename.decode('iso-8859-1')

        ArchiveMember.objects.get_or_create(
            filename=filename,
            file_size=info.file_size,
            archive_sha1=blob.sha1)


def fetch_link(link):
    """Return the file for *link*: the mirrored S3 copy when available,
    otherwise fetch the origin URL, mirror it (deduplicating by sha1),
    catalogue zip contents, and return the fetched blob.

    Raises whatever ``fetch_origin_url`` raises (``urllib.error.URLError``
    or ``FileTooBig``) after recording the failure as a Download row.
    """
    url = link.download_url

    # find last mirrored download: the most recent Download row for this
    # link that actually carries an S3 key
    download = Download.objects.filter(
        link_class=link.link_class, parameter=link.parameter).exclude(
            mirror_s3_key='').order_by('-downloaded_at').first()

    if download:
        # existing download was found; fetch it
        return download.fetch_from_s3()

    # no mirrored copy exists - fetch and mirror the origin file
    try:
        blob = fetch_origin_url(url)
    except (urllib.error.URLError, FileTooBig) as ex:
        _record_failed_fetch(link, ex)
        raise

    _mirror_blob(link, blob)

    if link.is_zip_file():
        _catalogue_zip_members(blob)

    return blob
예제 #8
0
파일: actions.py 프로젝트: n86cc/demozoo
def fetch_link(link):
    """Return the file contents for *link*.

    Prefers the most recent mirrored copy on S3.  On a miss, fetches the
    origin URL, mirrors the file to S3 (deduplicating by sha1), catalogues
    the contents of zip archives, and returns the fetched blob.  A failed
    origin fetch is recorded as an errored Download row before the
    exception (urllib2.URLError or FileTooBig) propagates.
    """
    # Fetch our mirrored copy of the given link if available;
    # if not, mirror and return the original file

    url = link.download_url

    # find last mirrored download: newest Download row for this link that
    # actually carries a mirror_s3_key
    download = Download.objects.filter(
        link_class=link.link_class, parameter=link.parameter
    ).exclude(mirror_s3_key='').order_by('-downloaded_at').first()

    if download:
        # existing download was found; fetch it
        return download.fetch_from_s3()
    else:
        # no mirrored copy exists - fetch and mirror the origin file
        try:
            blob = fetch_origin_url(url)
        except (urllib2.URLError, FileTooBig) as ex:
            # record the failure, then let the caller see the exception
            Download.objects.create(
                downloaded_at=datetime.datetime.now(),
                link_class=link.link_class,
                parameter=link.parameter,
                error_type=ex.__class__.__name__
            )
            raise

        download = Download(
            downloaded_at=datetime.datetime.now(),
            link_class=link.link_class,
            parameter=link.parameter,
            sha1=blob.sha1,
            md5=blob.md5,
            file_size=blob.file_size,
        )

        # is there already a mirrored link with this sha1?
        existing_download = Download.objects.filter(sha1=blob.sha1).first()
        if existing_download:
            # identical file already mirrored - reuse its S3 key, skip upload
            download.mirror_s3_key = existing_download.mirror_s3_key
        else:
            # key layout: 2/2/12-char slices of the sha1 as path segments,
            # then the cleaned original filename
            key_name = blob.sha1[0:2] + '/' + blob.sha1[2:4] + '/' + blob.sha1[4:16] + '/' + clean_filename(blob.filename)
            bucket = open_bucket()
            # NOTE(review): legacy boto-style S3 upload (Key /
            # set_contents_from_string) - presumably boto 2; verify against
            # the project's open_bucket implementation
            k = Key(bucket)
            k.key = key_name
            k.set_contents_from_string(blob.file_content)
            download.mirror_s3_key = key_name

        download.save()

        if link.is_zip_file():
            # catalogue the zipfile contents if we don't have them already
            if not ArchiveMember.objects.filter(archive_sha1=blob.sha1).exists():
                z = blob.as_zipfile()
                for info in z.infolist():
                    # zip files do not contain information about the character encoding of filenames.
                    # We therefore decode the filename as iso-8859-1 (an encoding which defines a character
                    # for every byte value) to ensure that it is *some* valid sequence of unicode characters
                    # that can be inserted into the database. When we need to access this zipfile entry
                    # again, we will re-encode it as iso-8859-1 to get back the original byte sequence.
                    ArchiveMember.objects.get_or_create(
                        filename=info.filename.decode('iso-8859-1'),
                        file_size=info.file_size,
                        archive_sha1=blob.sha1)

        return blob
예제 #9
0
def fetch_link(link):
	"""Return the contents of *link*, serving the S3 mirror when possible.

	On a cache miss the origin file is fetched, mirrored to S3 (reusing the
	existing object when an identical sha1 is already mirrored), zipfile
	contents are catalogued, and the fetched blob is returned.  Failed
	origin fetches are logged as errored Download rows and re-raised.
	"""
	url = link.download_url

	# newest Download row for this link that has actually been mirrored
	mirrored = Download.objects.filter(
		link_class=link.link_class, parameter=link.parameter
	).exclude(mirror_s3_key='').order_by('-downloaded_at').first()
	if mirrored:
		return mirrored.fetch_from_s3()

	# nothing mirrored yet - go to the origin, recording any failure
	try:
		blob = fetch_origin_url(url)
	except (urllib2.URLError, FileTooBig) as ex:
		Download.objects.create(
			downloaded_at=datetime.datetime.now(),
			link_class=link.link_class,
			parameter=link.parameter,
			error_type=ex.__class__.__name__
		)
		raise

	record = Download(
		downloaded_at=datetime.datetime.now(),
		link_class=link.link_class,
		parameter=link.parameter,
		sha1=blob.sha1,
		md5=blob.md5,
		file_size=blob.file_size,
	)

	# reuse the S3 object if an identical file was mirrored previously
	duplicate = Download.objects.filter(sha1=blob.sha1).first()
	if duplicate:
		record.mirror_s3_key = duplicate.mirror_s3_key
	else:
		# key layout: 2/2/12-char slices of the sha1, then the cleaned filename
		key_name = '/'.join([
			blob.sha1[0:2], blob.sha1[2:4], blob.sha1[4:16],
			clean_filename(blob.filename),
		])
		bucket = open_bucket()
		k = Key(bucket)
		k.key = key_name
		k.set_contents_from_string(blob.file_content)
		record.mirror_s3_key = key_name

	record.save()

	if link.is_zip_file() and not ArchiveMember.objects.filter(archive_sha1=blob.sha1).exists():
		# catalogue the zipfile contents.  Zip files carry no filename
		# encoding information, so each name is decoded as iso-8859-1 (which
		# defines a character for every byte value) to get *some* valid
		# unicode string for the database; re-encoding as iso-8859-1 later
		# recovers the original byte sequence.
		for entry in blob.as_zipfile().infolist():
			ArchiveMember.objects.get_or_create(
				filename=entry.filename.decode('iso-8859-1'),
				file_size=entry.file_size,
				archive_sha1=blob.sha1)

	return blob