Пример #1
0
    def fetch(self, url, modified_date):
        """Return the content of ``url``, downloading it only when needed.

        The url is downloaded and written to the cache when no cached
        version can be found, or when ``modified_date`` is newer than the
        latest cached version. Otherwise the latest cached file is read
        from disk and returned.

        Args:
            url: Location of the resource; its SHA1 hash keys the cache.
            modified_date: Last-modified date of the resource as accepted by
                ``str_to_datetime``. May be empty/None, in which case an
                existing cached version is always considered current.

        Returns:
            The downloaded data, or the raw content of the cached file.
        """
        # The freshness check below already treats a missing date as "not
        # modified", so only parse the date when one was actually supplied.
        if modified_date:
            modified_date = localize_datetime(str_to_datetime(modified_date))
        else:
            modified_date = None

        base_path = self.base_path(get_sha1_hash(url))

        def download_and_cache():
            # Shared by the "not cached yet" and the "cache is stale" paths
            data = self._download_file(url)
            self._write_to_cache(base_path, data, modified_date)
            return data

        try:
            file_path, latest_version = self._latest_version(base_path)
            latest_version_path = '%s-%s' % (file_path, latest_version)
            self._check_path(latest_version_path)
        except OSError:
            # File does not exist, download and cache the url
            return download_and_cache()

        if modified_date and modified_date > str_to_datetime(latest_version):
            # If file has been modified download it
            return download_and_cache()

        # todo force_old_files
        with open(latest_version_path, 'rb') as f:
            return f.read()
Пример #2
0
    def fetch(self, url, path, modified_date):
        """Fetch ``url``, using the cache entry derived from ``path``.

        The resource is downloaded (and written to the cache) when no cached
        version can be found, or when ``modified_date`` is newer than the
        latest cached version. A current cached version is only served when
        ``force_old_files`` is set in ``self.source_definition``.

        Args:
            url: Location of the resource to download.
            path: Value that is urlsafe-base64 encoded to key the cache.
            modified_date: Last-modified date of the resource as accepted by
                ``str_to_datetime``. May be empty/None, in which case an
                existing cached version is always considered current.

        Returns:
            ``(content_type, content_length, media_file)`` after a download,
            with ``media_file`` rewound to its start, or
            ``(None, content_length, data)`` when served from the cache,
            where ``data`` is the raw content of the cached file.

        Raises:
            ItemAlreadyProcessed: The cached version is current and
                ``force_old_files`` is not set.
        """
        # The freshness check below already treats a missing date as "not
        # modified", so only parse the date when one was actually supplied.
        if modified_date:
            modified_date = localize_datetime(str_to_datetime(modified_date))
        else:
            modified_date = None

        url_hash = base64.urlsafe_b64encode(path)
        base_path = self.base_path(url_hash)

        def download_and_cache():
            # Shared by the "not cached yet" and the "cache is stale" paths
            content_type, content_length, media_file = self.download_url(url)
            data = media_file.read()
            # read() leaves the file positioned at its end; rewind it so the
            # caller receives a file it can actually read from.
            media_file.seek(0, 0)
            self._write_to_cache(base_path, data, modified_date)
            return content_type, content_length, media_file

        try:
            file_path, latest_version = self._latest_version(base_path)
            latest_version_path = '%s-%s' % (file_path, latest_version)
            self._check_path(latest_version_path)
        except OSError:
            # File does not exist, download and cache the url
            return download_and_cache()

        if modified_date and modified_date > str_to_datetime(latest_version):
            # If file has been modified download it
            return download_and_cache()

        if self.source_definition.get('force_old_files'):
            with open(latest_version_path, 'rb') as f:
                data = f.read()
            return None, len(data), data

        raise ItemAlreadyProcessed("Item %s has already been processed on %s. "
                                   "Set 'force_old_files' in source_definition "
                                   "to download old files from cache." %
                                   (url, latest_version))
Пример #3
0
    def fetch(self, url, path, modified_date):
        """Return the resource at ``url``, consulting the cache for ``path``.

        A download happens when no cached version can be found or when
        ``modified_date`` is newer than the latest cached version; the
        downloaded data is written to the cache. A current cached version is
        only served when ``force_old_files`` is set in
        ``self.source_definition``.

        Returns:
            ``(content_type, content_length, media_file)`` after a download,
            or ``(None, content_length, data)`` when served from the cache.

        Raises:
            ItemAlreadyProcessed: The cached version is current and
                ``force_old_files`` is not set.
        """
        modified_date = (
            localize_datetime(str_to_datetime(modified_date))
            if modified_date else None
        )

        base_path = self.base_path(base64.urlsafe_b64encode(path))

        def refresh_cache():
            # Download the url, store the data in the cache and hand back
            # the download result.
            content_type, content_length, media_file = self.download_url(url)
            payload = media_file.read()
            # Reading consumed the file, so rewind it before it is reused
            media_file.seek(0, 0)
            self._write_to_cache(base_path, payload, modified_date)
            return content_type, content_length, media_file

        try:
            file_path, latest_version = self._latest_version(base_path)
            latest_version_path = '%s-%s' % (file_path, latest_version)
            self._check_path(latest_version_path)
        except OSError:
            # Nothing cached yet for this path
            return refresh_cache()

        if modified_date and modified_date > str_to_datetime(latest_version):
            # The source is newer than the cached version
            return refresh_cache()

        if self.source_definition.get('force_old_files'):
            with open(latest_version_path, 'rb') as cached_file:
                # Measure the size by seeking to the end, then rewind to read
                cached_file.seek(0, 2)
                content_length = cached_file.tell()
                cached_file.seek(0, 0)
                return None, content_length, cached_file.read()

        raise ItemAlreadyProcessed(
            "Item %s has already been processed on %s. "
            "Set 'force_old_files' in source_definition "
            "to download old files from cache." % (url, latest_version))
Пример #4
0
    def fetch(self, url, path, modified_date):
        """Fetch a resource url and save it to a path in GCS. The resource will
        only be downloaded from the source when the file has been modified,
        otherwise the file will be downloaded from cache if 'force_old_files'
        has been set.

        Args:
            url: Location of the resource to download.
            path: Name of the blob inside the bucket.
            modified_date: Last-modified date of the resource as accepted by
                ``str_to_datetime``; only parsed when the blob already exists.

        Returns:
            ``(content_type, content_length, media_file)``. After a download
            ``media_file`` is the downloaded file rewound to its start; when
            served from the bucket it is a temporary file holding the blob's
            content, with the blob's content type and size.

        Raises:
            ItemAlreadyProcessed: The stored blob is current and
                ``force_old_files`` is not set.
        """
        def download_and_upload(target_blob):
            # Shared by the "blob missing" and the "blob is stale" paths
            content_type, content_length, media_file = self.download_url(url)
            data = media_file.read()
            # read() leaves the file positioned at its end; rewind it so the
            # caller receives a file it can actually read from.
            media_file.seek(0, 0)
            self.compressed_upload(target_blob, data, content_type)
            return content_type, content_length, media_file

        bucket = self.get_bucket()
        blob = bucket.get_blob(path)
        if not blob:
            # File does not exist
            return download_and_upload(bucket.blob(path))

        modified_date = localize_datetime(str_to_datetime(modified_date))
        if modified_date > blob.updated:
            # Upload newer file
            return download_and_upload(blob)

        if self.source_definition.get('force_old_files'):
            # Download up-to-date file
            media_file = NamedTemporaryFile(dir=TEMP_DIR_PATH)
            blob.download_to_file(media_file)
            media_file.seek(0, 0)
            return blob.content_type, blob.size, media_file

        raise ItemAlreadyProcessed("Item %s has already been processed on %s. "
                                   "Set 'force_old_files' in source_definition "
                                   "to download old files from cache." %
                                   (url, blob.updated.strftime("%c")))
Пример #5
0
 def sanitize(value):
     """Convert a truthy ``value`` with ``str_to_datetime``.

     NOTE(review): the previous docstring described stripping spaces and
     converting to unicode, which the code visible here does not do --
     confirm the intended behaviour.
     """
     if value:
         # Falsy input (None, '', ...) skips the conversion entirely
         return str_to_datetime(value)