예제 #1
0
파일: actions.py 프로젝트: n86cc/demozoo
def fetch_link(link):
    """Return the file behind ``link``, mirroring it to S3 on first fetch.

    If a ``Download`` with a non-empty ``mirror_s3_key`` already exists for
    this link's class and parameter, the most recent one is fetched from S3
    and returned. Otherwise the origin URL is fetched, a ``Download`` row is
    saved (reusing the S3 key of any ``Download`` with the same SHA-1, or
    uploading the content under a new key), the members of zip files are
    catalogued as ``ArchiveMember`` rows, and the fetched blob is returned.

    Raises:
        urllib2.URLError, FileTooBig: re-raised from the origin fetch after a
            ``Download`` row carrying the exception class name is created.
    """
    url = link.download_url

    # find last mirrored download
    download = Download.objects.filter(
        link_class=link.link_class, parameter=link.parameter
    ).exclude(mirror_s3_key='').order_by('-downloaded_at').first()

    if download:
        # existing download was found; fetch it
        return download.fetch_from_s3()

    # no mirrored copy exists - fetch and mirror the origin file
    try:
        blob = fetch_origin_url(url)
    except (urllib2.URLError, FileTooBig) as ex:
        # keep a record of the failed attempt, then let the caller handle it
        Download.objects.create(
            downloaded_at=datetime.datetime.now(),
            link_class=link.link_class,
            parameter=link.parameter,
            error_type=ex.__class__.__name__
        )
        raise

    download = Download(
        downloaded_at=datetime.datetime.now(),
        link_class=link.link_class,
        parameter=link.parameter,
        sha1=blob.sha1,
        md5=blob.md5,
        file_size=blob.file_size,
    )

    # is there already a mirrored link with this sha1?
    existing_download = Download.objects.filter(sha1=blob.sha1).first()
    if existing_download:
        download.mirror_s3_key = existing_download.mirror_s3_key
    else:
        key_name = blob.sha1[0:2] + '/' + blob.sha1[2:4] + '/' + blob.sha1[4:16] + '/' + clean_filename(blob.filename)
        bucket = open_bucket()
        k = Key(bucket)
        k.key = key_name
        k.set_contents_from_string(blob.file_content)
        download.mirror_s3_key = key_name

    download.save()

    # catalogue the zipfile contents if we don't have them already
    if link.is_zip_file() and not ArchiveMember.objects.filter(archive_sha1=blob.sha1).exists():
        z = blob.as_zipfile()
        for info in z.infolist():
            # zip files do not reliably declare the character encoding of
            # filenames, so the name is treated as an arbitrary byte string
            # and decoded as iso-8859-1 (an encoding which defines a
            # character for every byte value) to get *some* valid unicode
            # string for the database. Re-encoding as iso-8859-1 later gives
            # back the same byte sequence.
            #
            # zipfile hands back a unicode string instead of bytes when the
            # entry's UTF-8 flag is set (Python 2) or always (Python 3).
            # Calling .decode() on that fails (implicit ASCII encode on
            # Python 2, no such method on Python 3), so unicode names are
            # first turned into bytes with cp437 - the codec Python 3's
            # zipfile uses to decode names lacking the UTF-8 flag. Names not
            # representable in cp437 still raise UnicodeEncodeError.
            filename = info.filename
            if not isinstance(filename, bytes):
                filename = filename.encode('cp437')

            ArchiveMember.objects.get_or_create(
                filename=filename.decode('iso-8859-1'),
                file_size=info.file_size,
                archive_sha1=blob.sha1)

    return blob
예제 #2
0
def fetch_link(link):
    """Return the file for ``link``, preferring our S3 mirror over the origin.

    When an earlier ``Download`` for the same link class and parameter has a
    non-empty ``mirror_s3_key``, the most recently downloaded one is fetched
    from S3 and returned. Otherwise the origin URL is fetched and a new
    ``Download`` row is saved: it reuses the S3 key of any ``Download`` with
    the same SHA-1, or the content is uploaded under a key built from the
    SHA-1 and the cleaned filename. For zip links whose contents are not yet
    catalogued, one ``ArchiveMember`` row per archive entry is created. The
    freshly fetched blob is returned.

    Raises:
        urllib.error.URLError, FileTooBig: re-raised from the origin fetch
            after a ``Download`` row carrying the exception class name has
            been created.
    """
    origin_url = link.download_url

    # Newest download of this link that actually has a mirrored copy.
    same_link = Download.objects.filter(
        link_class=link.link_class, parameter=link.parameter)
    mirrored = same_link.exclude(
        mirror_s3_key='').order_by('-downloaded_at').first()
    if mirrored:
        return mirrored.fetch_from_s3()

    # Nothing mirrored yet: go to the origin, logging the failure if any.
    try:
        blob = fetch_origin_url(origin_url)
    except (urllib.error.URLError, FileTooBig) as error:
        Download.objects.create(
            downloaded_at=datetime.datetime.now(),
            link_class=link.link_class,
            parameter=link.parameter,
            error_type=error.__class__.__name__,
        )
        raise

    record = Download(
        downloaded_at=datetime.datetime.now(),
        link_class=link.link_class,
        parameter=link.parameter,
        sha1=blob.sha1,
        md5=blob.md5,
        file_size=blob.file_size,
    )

    # Identical content may already be mirrored under another link.
    same_content = Download.objects.filter(sha1=blob.sha1).first()
    if not same_content:
        s3_key = '/'.join((
            blob.sha1[0:2],
            blob.sha1[2:4],
            blob.sha1[4:16],
            clean_filename(blob.filename),
        ))
        open_bucket().put_object(Key=s3_key, Body=blob.file_content)
        record.mirror_s3_key = s3_key
    else:
        record.mirror_s3_key = same_content.mirror_s3_key

    record.save()

    # Catalogue the zip members unless this archive is already catalogued.
    if link.is_zip_file() and not ArchiveMember.objects.filter(
            archive_sha1=blob.sha1).exists():
        # Background - "The Incredible Disaster of Platform Specific
        # Implementations of Zip":
        # https://gist.github.com/jnalley/cec21bca2d865758bc5e23654df28bd5
        #
        # Historically a zip file does not say which character encoding its
        # filenames use. A flag meaning "this is utf-8" supposedly exists,
        # but it is unclear how widely it is used or recognised, and
        # scene.org is bound to host some odd archives. A member name is
        # therefore treated as an arbitrary byte string.
        #
        # The database wants unicode, so the byte string is decoded as
        # iso-8859-1 on the way in and encoded as iso-8859-1 on the way out.
        # iso-8859-1 is used because it gives a well-defined result for any
        # byte string and does not mangle pure ASCII names.
        #
        # Obtaining the byte string from ZipFile.infolist():
        #
        # * Python 2 returns a unicode string when the utf-8 flag is set and
        #   a byte string otherwise. The old Python-2-only code called
        #   filename.decode('iso-8859-1'), which would fail on a unicode
        #   string with non-ASCII characters, so whatever reached the
        #   database started out as pure ASCII or as a byte string. In both
        #   cases database_value.encode('iso-8859-1') yields a byte string
        #   that Python 2's zipfile accepts (it compares equal to the name
        #   it originally gave us).
        #
        # * Python 3 ALWAYS returns a unicode string: decoded as utf-8 when
        #   the flag is set, as cp437 when it is not. Which one happened
        #   does not matter, provided that
        #     1) the unicode -> bytes transformation is reversible, and
        #     2) the bytes equal what Python 2 would have produced for the
        #        same filename.
        #   Condition 2 is met by filename.encode('cp437'), whose reverse
        #   is bytestring.decode('cp437').
        #
        # Final algorithm:
        #
        #   zipfile to database:
        #     if filename is unicode (Python 3, or the flag is set):
        #         filename = filename.encode('cp437')  # now a byte string
        #     return filename.decode('iso-8859-1')
        #
        #   database to zipfile:
        #     bytestring = database_value.encode('iso-8859-1')
        #     on Python 2: return bytestring
        #     otherwise:   return bytestring.decode('cp437')
        for member in blob.as_zipfile().infolist():
            raw_name = member.filename
            if isinstance(raw_name, str):  # pragma: no cover
                raw_name = raw_name.encode('cp437')
            stored_name = raw_name.decode('iso-8859-1')

            ArchiveMember.objects.get_or_create(
                filename=stored_name,
                file_size=member.file_size,
                archive_sha1=blob.sha1)

    return blob
예제 #3
0
def fetch_link(link):
	"""Return the file behind ``link``, mirroring it to S3 on first fetch.

	If a ``Download`` with a non-empty ``mirror_s3_key`` already exists for
	this link's class and parameter, the most recent one is fetched from S3
	and returned. Otherwise the origin URL is fetched, a ``Download`` row is
	saved (reusing the S3 key of any ``Download`` with the same SHA-1, or
	uploading the content under a new key), the members of zip files are
	catalogued as ``ArchiveMember`` rows, and the fetched blob is returned.

	Raises:
		urllib2.URLError, FileTooBig: re-raised from the origin fetch after a
			``Download`` row carrying the exception class name is created.
	"""
	url = link.download_url

	# find last mirrored download
	download = Download.objects.filter(
		link_class=link.link_class, parameter=link.parameter
	).exclude(mirror_s3_key='').order_by('-downloaded_at').first()

	if download:
		# existing download was found; fetch it
		return download.fetch_from_s3()

	# no mirrored copy exists - fetch and mirror the origin file
	try:
		blob = fetch_origin_url(url)
	except (urllib2.URLError, FileTooBig) as ex:
		# keep a record of the failed attempt, then let the caller handle it
		Download.objects.create(
			downloaded_at=datetime.datetime.now(),
			link_class=link.link_class,
			parameter=link.parameter,
			error_type=ex.__class__.__name__
		)
		raise

	download = Download(
		downloaded_at=datetime.datetime.now(),
		link_class=link.link_class,
		parameter=link.parameter,
		sha1=blob.sha1,
		md5=blob.md5,
		file_size=blob.file_size,
	)

	# is there already a mirrored link with this sha1?
	existing_download = Download.objects.filter(sha1=blob.sha1).first()
	if existing_download:
		download.mirror_s3_key = existing_download.mirror_s3_key
	else:
		key_name = blob.sha1[0:2] + '/' + blob.sha1[2:4] + '/' + blob.sha1[4:16] + '/' + clean_filename(blob.filename)
		bucket = open_bucket()
		k = Key(bucket)
		k.key = key_name
		k.set_contents_from_string(blob.file_content)
		download.mirror_s3_key = key_name

	download.save()

	# catalogue the zipfile contents if we don't have them already
	if link.is_zip_file() and not ArchiveMember.objects.filter(archive_sha1=blob.sha1).exists():
		z = blob.as_zipfile()
		for info in z.infolist():
			# zip files do not reliably declare the character encoding of
			# filenames, so the name is treated as an arbitrary byte string
			# and decoded as iso-8859-1 (an encoding which defines a
			# character for every byte value) to get *some* valid unicode
			# string for the database. Re-encoding as iso-8859-1 later gives
			# back the same byte sequence.
			#
			# zipfile hands back a unicode string instead of bytes when the
			# entry's UTF-8 flag is set (Python 2) or always (Python 3).
			# Calling .decode() on that fails (implicit ASCII encode on
			# Python 2, no such method on Python 3), so unicode names are
			# first turned into bytes with cp437 - the codec Python 3's
			# zipfile uses to decode names lacking the UTF-8 flag. Names not
			# representable in cp437 still raise UnicodeEncodeError.
			filename = info.filename
			if not isinstance(filename, bytes):
				filename = filename.encode('cp437')

			ArchiveMember.objects.get_or_create(
				filename=filename.decode('iso-8859-1'),
				file_size=info.file_size,
				archive_sha1=blob.sha1)

	return blob