def fetch(self, url, modified_date):
    """Return the content of ``url``, using the local file cache when possible.

    The cache location is ``self.base_path`` applied to the SHA-1 hash of
    ``url``. The resource is downloaded and written to the cache when no
    cached version can be located, or when ``modified_date`` is newer than
    the latest cached version. Otherwise the latest cached file is read.

    Args:
        url: Location of the resource to fetch.
        modified_date: Last-modified value accepted by ``str_to_datetime``.
            May be falsy (``None`` or empty); a cached version is then never
            treated as outdated.

    Returns:
        Whatever ``self._download_file`` returns after a download, or the
        bytes read from the cached file.
    """
    # Only parse a value that was actually supplied. The freshness check
    # below already treats a falsy modified_date as "unknown", so parsing
    # it unconditionally contradicted that check.
    if modified_date:
        modified_date = localize_datetime(str_to_datetime(modified_date))
    else:
        modified_date = None

    url_hash = get_sha1_hash(url)
    base_path = self.base_path(url_hash)

    try:
        file_path, latest_version = self._latest_version(base_path)
        latest_version_path = '%s-%s' % (file_path, latest_version)
        self._check_path(latest_version_path)
    except OSError:
        # File does not exist, download and cache the url
        needs_download = True
    else:
        # If the file has been modified since it was cached, download it
        needs_download = bool(
            modified_date
            and modified_date > str_to_datetime(latest_version)
        )

    if needs_download:
        data = self._download_file(url)
        self._write_to_cache(base_path, data, modified_date)
        return data

    # TODO: honour 'force_old_files' here
    with open(latest_version_path, 'rb') as f:
        return f.read()
def fetch(self, url, path, modified_date):
    """Fetch ``url``, caching its content on disk under a key built from ``path``.

    The cache location is ``self.base_path`` applied to the URL-safe base64
    encoding of ``path``. The resource is downloaded and written to the
    cache when no cached version can be located, or when ``modified_date``
    is newer than the latest cached version.

    Args:
        url: Location of the resource to download.
        path: Value used to build the cache key.
        modified_date: Last-modified value accepted by ``str_to_datetime``.
            May be falsy (``None`` or empty); a cached version is then never
            treated as outdated.

    Returns:
        After a download: ``(content_type, content_length, media_file)`` as
        returned by ``self.download_url``, with ``media_file`` rewound to
        its start. When served from cache (``force_old_files`` set):
        ``(None, content_length, data)`` where ``data`` is the cached
        content as bytes, not a file object.

    Raises:
        ItemAlreadyProcessed: If the cached version is up to date and
            ``force_old_files`` is not set in ``self.source_definition``.
    """
    # Only parse a value that was actually supplied. The freshness check
    # below already treats a falsy modified_date as "unknown".
    if modified_date:
        modified_date = localize_datetime(str_to_datetime(modified_date))
    else:
        modified_date = None

    url_hash = base64.urlsafe_b64encode(path)
    base_path = self.base_path(url_hash)

    try:
        file_path, latest_version = self._latest_version(base_path)
        latest_version_path = '%s-%s' % (file_path, latest_version)
        self._check_path(latest_version_path)
    except OSError:
        # File does not exist, download and cache the url
        needs_download = True
    else:
        # If the file has been modified since it was cached, download it
        needs_download = bool(
            modified_date
            and modified_date > str_to_datetime(latest_version)
        )

    if needs_download:
        content_type, content_length, media_file = self.download_url(url)
        data = media_file.read()
        # read() leaves the stream at EOF; rewind it so the caller receives
        # a file it can actually read from.
        media_file.seek(0, 0)
        self._write_to_cache(base_path, data, modified_date)
        return content_type, content_length, media_file

    if self.source_definition.get('force_old_files'):
        with open(latest_version_path, 'rb') as f:
            # Seek to the end to learn the size, then rewind before reading.
            f.seek(0, 2)
            content_length = f.tell()
            f.seek(0, 0)
            return None, content_length, f.read()

    raise ItemAlreadyProcessed("Item %s has already been processed on %s. "
                               "Set 'force_old_files' in source_definition "
                               "to download old files from cache." %
                               (url, latest_version))
def fetch(self, url, path, modified_date):
    """Fetch ``url`` through the on-disk cache keyed by ``path``.

    A download happens when no cached version can be located or when
    ``modified_date`` is newer than the latest cached version. An up-to-date
    cached copy is only served when ``force_old_files`` is set in
    ``self.source_definition``.

    Returns:
        ``(content_type, content_length, media_file)`` after a download,
        with ``media_file`` rewound to its start, or
        ``(None, content_length, data)`` with the cached bytes.

    Raises:
        ItemAlreadyProcessed: If the cached copy is current and
            ``force_old_files`` is not set.
    """
    modified_date = (
        localize_datetime(str_to_datetime(modified_date))
        if modified_date else None
    )
    cache_base = self.base_path(base64.urlsafe_b64encode(path))

    def download_and_cache():
        # Download the resource, store it in the cache and hand back the
        # rewound stream.
        content_type, content_length, stream = self.download_url(url)
        payload = stream.read()
        # read() consumed the stream, so seek back to the beginning to make
        # it usable again.
        stream.seek(0, 0)
        self._write_to_cache(cache_base, payload, modified_date)
        return content_type, content_length, stream

    try:
        stem, cached_version = self._latest_version(cache_base)
        cached_path = '%s-%s' % (stem, cached_version)
        self._check_path(cached_path)
    except OSError:
        # Nothing cached yet.
        return download_and_cache()

    if modified_date and modified_date > str_to_datetime(cached_version):
        # The source is newer than the cached copy.
        return download_and_cache()

    if not self.source_definition.get('force_old_files'):
        raise ItemAlreadyProcessed(
            "Item %s has already been processed on %s. "
            "Set 'force_old_files' in source_definition "
            "to download old files from cache." % (url, cached_version))

    with open(cached_path, 'rb') as cached_file:
        # Jump to the end to measure the size, then rewind and read.
        cached_file.seek(0, 2)
        size = cached_file.tell()
        cached_file.seek(0, 0)
        return None, size, cached_file.read()
def fetch(self, url, path, modified_date):
    """Fetch a resource url and save it to a path in GCS.

    The resource will only be downloaded from the source when no blob
    exists at ``path`` or when ``modified_date`` is newer than the blob's
    ``updated`` timestamp. Otherwise the file is downloaded from the cache
    if 'force_old_files' has been set.

    Args:
        url: Location of the resource to download.
        path: Blob name inside the bucket returned by ``self.get_bucket``.
        modified_date: Last-modified value accepted by ``str_to_datetime``.
            May be falsy (``None`` or empty); an existing blob is then never
            treated as outdated.

    Returns:
        ``(content_type, content_length, media_file)``. ``media_file`` is
        always positioned at its start, whether it comes from
        ``self.download_url`` or from a temporary file filled from the blob.

    Raises:
        ItemAlreadyProcessed: If the blob is up to date and
            ``force_old_files`` is not set in ``self.source_definition``.
    """
    bucket = self.get_bucket()
    blob = bucket.get_blob(path)

    if not blob:
        # File does not exist
        blob = bucket.blob(path)
        needs_upload = True
    else:
        # Only parse a value that was actually supplied; without a
        # modification date the existing blob cannot be judged outdated.
        if modified_date:
            modified_date = localize_datetime(str_to_datetime(modified_date))
        else:
            modified_date = None
        needs_upload = (
            modified_date is not None and modified_date > blob.updated
        )

    if needs_upload:
        # Upload newer file
        content_type, content_length, media_file = self.download_url(url)
        data = media_file.read()
        # read() leaves the stream at EOF; rewind it so the caller receives
        # a readable file, matching the cached branch below.
        media_file.seek(0, 0)
        self.compressed_upload(blob, data, content_type)
        return content_type, content_length, media_file

    if self.source_definition.get('force_old_files'):
        # Download up-to-date file
        media_file = NamedTemporaryFile(dir=TEMP_DIR_PATH)
        blob.download_to_file(media_file)
        media_file.seek(0, 0)
        return blob.content_type, blob.size, media_file

    raise ItemAlreadyProcessed("Item %s has already been processed on %s. "
                               "Set 'force_old_files' in source_definition "
                               "to download old files from cache." %
                               (url, blob.updated.strftime("%c")))
def sanitize(value):
    """Return ``str_to_datetime(value)``, or ``None`` when ``value`` is falsy."""
    if not value:
        return None
    return str_to_datetime(value)