def test_write_to_cache_without_date(self):
    """_write_to_cache without an explicit date should stamp the cached
    file with the (localized) current time.
    """
    file_path = self.test_cache_path + "aa/bb/aabb-testfile"
    data = "test-string"
    self.mixin._write_to_cache(file_path, data)
    # The mixin stamps the file with now(); recompute the same stamp here.
    # NOTE(review): this can race across a timestamp boundary if the clock
    # ticks between the write and this line — TODO confirm the granularity
    # of datetime_to_unixstamp.
    expected_modified_date = localize_datetime(datetime.datetime.now())
    file_exists = os.path.exists(
        "%s-%i" % (file_path, datetime_to_unixstamp(expected_modified_date)))
    # Bug fix: assertTrue's second positional argument is `msg`, so the
    # original `assertTrue(file_exists, True)` passed True as the failure
    # message instead of asserting anything about it.
    self.assertTrue(file_exists)
def fetch(self, url, modified_date):
    """Fetch `url`, serving it from the local cache when the resource has
    not been modified since the cached version.

    :param url: resource URL to download.
    :param modified_date: last-modified date string of the resource, may
        be falsy when unknown.
    :return: the raw body of the resource.
    """
    # Bug fix: guard against a missing modified_date — str_to_datetime
    # would otherwise be called with None (the later variants of this
    # method in this file apply the same guard).
    if modified_date:
        modified_date = localize_datetime(str_to_datetime(modified_date))
    else:
        modified_date = None

    url_hash = get_sha1_hash(url)
    base_path = self.base_path(url_hash)

    try:
        file_path, latest_version = self._latest_version(base_path)
        latest_version_path = '%s-%s' % (file_path, latest_version)
        self._check_path(latest_version_path)
    except OSError:
        # File does not exist, download and cache the url
        data = self._download_file(url)
        self._write_to_cache(base_path, data, modified_date)
        return data

    if modified_date and modified_date > str_to_datetime(latest_version):
        # If file has been modified download it
        data = self._download_file(url)
        self._write_to_cache(base_path, data, modified_date)
        return data
    else:
        # todo force_old_files
        with open(latest_version_path, 'rb') as f:
            return f.read()
def fetch(self, url, path, modified_date):
    """Fetch `url`, serving it from the local cache when the resource has
    not been modified since the cached version.

    :param url: resource URL to download.
    :param path: identifier used to derive the cache key (base64-encoded).
    :param modified_date: last-modified date string of the resource, may
        be falsy when unknown.
    :return: tuple of (content_type, content_length, file-like/bytes).
    :raises ItemAlreadyProcessed: when the item is cached, unmodified and
        'force_old_files' is not set.
    """
    # Bug fix: guard against a missing modified_date — str_to_datetime
    # would otherwise be called with None.
    if modified_date:
        modified_date = localize_datetime(str_to_datetime(modified_date))
    else:
        modified_date = None

    url_hash = base64.urlsafe_b64encode(path)
    base_path = self.base_path(url_hash)

    try:
        file_path, latest_version = self._latest_version(base_path)
        latest_version_path = '%s-%s' % (file_path, latest_version)
        self._check_path(latest_version_path)
    except OSError:
        # File does not exist, download and cache the url
        content_type, content_length, media_file = self.download_url(url)
        data = media_file.read()
        # Bug fix: read() leaves the file at EOF, so without rewinding the
        # returned media_file is unreadable by the caller.
        media_file.seek(0, 0)
        self._write_to_cache(base_path, data, modified_date)
        return content_type, content_length, media_file

    if modified_date and modified_date > str_to_datetime(latest_version):
        # If file has been modified download it
        content_type, content_length, media_file = self.download_url(url)
        data = media_file.read()
        media_file.seek(0, 0)
        self._write_to_cache(base_path, data, modified_date)
        return content_type, content_length, media_file
    else:
        if self.source_definition.get('force_old_files'):
            with open(latest_version_path, 'rb') as f:
                # Determine size by seeking to the end (whence=2 is SEEK_END)
                f.seek(0, 2)
                content_length = f.tell()
                f.seek(0, 0)
                return None, content_length, f.read()
        raise ItemAlreadyProcessed("Item %s has already been processed on %s. "
                                   "Set 'force_old_files' in source_definition "
                                   "to download old files from cache." %
                                   (url, latest_version))
def fetch(self, url, path, modified_date):
    """Fetch a resource, serving it from the local file cache when possible.

    Downloads `url` when no cached version exists or when `modified_date`
    is newer than the newest cached version; otherwise the cached file is
    served, but only if 'force_old_files' is set in the source definition.

    :param url: resource URL to download.
    :param path: identifier used to derive the cache key (base64-encoded).
    :param modified_date: last-modified date string of the resource, may
        be falsy when unknown.
    :return: tuple of (content_type, content_length, payload).
        NOTE(review): the 'force_old_files' branch returns raw bytes while
        the download branches return a file object — confirm callers
        handle both shapes.
    :raises ItemAlreadyProcessed: when the item is cached, unmodified and
        'force_old_files' is not set.
    """
    # Guard: str_to_datetime would fail on a missing date.
    if modified_date:
        modified_date = localize_datetime(str_to_datetime(modified_date))
    else:
        modified_date = None

    # urlsafe base64 keeps the cache key filesystem-safe.
    url_hash = base64.urlsafe_b64encode(path)
    base_path = self.base_path(url_hash)

    try:
        file_path, latest_version = self._latest_version(base_path)
        latest_version_path = '%s-%s' % (file_path, latest_version)
        self._check_path(latest_version_path)
    except OSError:
        # File does not exist, download and cache the url
        content_type, content_length, media_file = self.download_url(url)
        data = media_file.read()
        # read() iterates over the file to the end, so we have to seek to
        # the beginning to use it again!
        media_file.seek(0, 0)
        self._write_to_cache(base_path, data, modified_date)
        return content_type, content_length, media_file

    if modified_date and modified_date > str_to_datetime(latest_version):
        # If file has been modified download it
        content_type, content_length, media_file = self.download_url(url)
        data = media_file.read()
        media_file.seek(0, 0)
        self._write_to_cache(base_path, data, modified_date)
        return content_type, content_length, media_file
    else:
        if self.source_definition.get('force_old_files'):
            with open(latest_version_path, 'rb') as f:
                # Determine size by seeking to the end (whence=2 is SEEK_END)
                f.seek(0, 2)
                content_length = f.tell()
                f.seek(0, 0)
                return None, content_length, f.read()
        raise ItemAlreadyProcessed(
            "Item %s has already been processed on %s. "
            "Set 'force_old_files' in source_definition "
            "to download old files from cache." % (url, latest_version))
def fetch(self, url, path, modified_date):
    """Fetch a resource url and save it to a path in GCS. The resource will
    only be downloaded from the source when the file has been modified,
    otherwise the file will be downloaded from cache if 'force_old_files'
    has been set.

    :param url: resource URL to download.
    :param path: GCS blob path used as the cache key.
    :param modified_date: last-modified date string of the resource, may
        be falsy when unknown.
    :return: tuple of (content_type, content_length, file-like object).
    :raises ItemAlreadyProcessed: when the item is cached, unmodified and
        'force_old_files' is not set.
    """
    bucket = self.get_bucket()
    blob = bucket.get_blob(path)

    if not blob:
        blob = bucket.blob(path)
        # File does not exist
        content_type, content_length, media_file = self.download_url(url)
        data = media_file.read()
        # Bug fix: read() leaves the file at EOF; rewind so the returned
        # media_file is readable by the caller (the local-cache variant of
        # this method applies the same fix).
        media_file.seek(0, 0)
        self.compressed_upload(blob, data, content_type)
        return content_type, content_length, media_file

    # Bug fix: guard against a missing modified_date — str_to_datetime
    # would otherwise be called with None.
    if modified_date:
        modified_date = localize_datetime(str_to_datetime(modified_date))
    else:
        modified_date = None

    if modified_date and modified_date > blob.updated:
        # Upload newer file
        content_type, content_length, media_file = self.download_url(url)
        data = media_file.read()
        media_file.seek(0, 0)
        self.compressed_upload(blob, data, content_type)
        return content_type, content_length, media_file
    elif self.source_definition.get('force_old_files'):
        # Download up-to-date file
        media_file = NamedTemporaryFile(dir=TEMP_DIR_PATH)
        blob.download_to_file(media_file)
        media_file.seek(0, 0)
        return blob.content_type, blob.size, media_file

    raise ItemAlreadyProcessed("Item %s has already been processed on %s. "
                               "Set 'force_old_files' in source_definition "
                               "to download old files from cache." %
                               (url, blob.updated.strftime("%c")))
class LocalCachingMixin(HttpRequestMixin):
    """Mixin that caches downloaded files on the local filesystem.

    Cached files live under DATA_DIR_PATH/cache/<index_name>/, sharded by
    the first four characters of the cache key; each cached version is
    suffixed with its modification timestamp.
    """

    def base_path(self, file_name):
        """Return the sharded cache path for `file_name`.

        The first two and next two characters become subdirectories so no
        single directory accumulates too many files.
        """
        first_dir = file_name[0:2]
        second_dir = file_name[2:4]
        return os.path.join(
            DATA_DIR_PATH,
            'cache',
            self.source_definition['index_name'],
            first_dir,
            second_dir,
            file_name,
        )

    @staticmethod
    def _latest_version(file_path):
        """Return (file_path, newest_version_suffix) for a cached file.

        :raises OSError: when no cached version exists.
        """
        version_paths = glob.glob('%s-*' % file_path)
        if len(version_paths) < 1:
            raise OSError
        versions = [
            os.path.basename(version_path).rpartition("-")[2]
            for version_path in version_paths
        ]
        # NOTE(review): version suffixes are compared as strings;
        # lexicographic order matches numeric order only while all
        # timestamps have the same digit count — TODO confirm.
        latest_version = sorted(versions, reverse=True)[0]
        return file_path, latest_version,

    @staticmethod
    def _check_path(path):
        """Sanity-check a cached file by size.

        :raises InvalidFile: when the file is smaller than two bytes
            (presumably an OSError subclass, since fetch() catches
            OSError around this call — TODO confirm).
        """
        file_bytes = os.path.getsize(path)
        # Raise OSError if the filesize is smaller than two bytes
        if file_bytes < 2:
            raise InvalidFile

    def fetch(self, url, path, modified_date):
        """Fetch a resource, serving it from the local cache when possible.

        Downloads `url` when no cached version exists or when
        `modified_date` is newer than the newest cached version; otherwise
        the cached file is served, but only if 'force_old_files' is set.

        :param url: resource URL to download.
        :param path: identifier used to derive the cache key.
        :param modified_date: last-modified date string, may be falsy.
        :return: tuple of (content_type, content_length, payload).
            NOTE(review): the 'force_old_files' branch returns raw bytes
            while the download branches return a file object — confirm
            callers handle both shapes.
        :raises ItemAlreadyProcessed: when cached, unmodified and
            'force_old_files' is not set.
        """
        # Guard: str_to_datetime would fail on a missing date.
        if modified_date:
            modified_date = localize_datetime(str_to_datetime(modified_date))
        else:
            modified_date = None

        # urlsafe base64 keeps the cache key filesystem-safe.
        url_hash = base64.urlsafe_b64encode(path)
        base_path = self.base_path(url_hash)

        try:
            file_path, latest_version = self._latest_version(base_path)
            latest_version_path = '%s-%s' % (file_path, latest_version)
            self._check_path(latest_version_path)
        except OSError:
            # File does not exist, download and cache the url
            content_type, content_length, media_file = self.download_url(url)
            data = media_file.read()
            # read() iterates over the file to the end, so we have to seek
            # to the beginning to use it again!
            media_file.seek(0, 0)
            self._write_to_cache(base_path, data, modified_date)
            return content_type, content_length, media_file

        if modified_date and modified_date > str_to_datetime(latest_version):
            # If file has been modified download it
            content_type, content_length, media_file = self.download_url(url)
            data = media_file.read()
            media_file.seek(0, 0)
            self._write_to_cache(base_path, data, modified_date)
            return content_type, content_length, media_file
        else:
            if self.source_definition.get('force_old_files'):
                with open(latest_version_path, 'rb') as f:
                    # Determine size via SEEK_END (whence=2), then rewind.
                    f.seek(0, 2)
                    content_length = f.tell()
                    f.seek(0, 0)
                    return None, content_length, f.read()
            raise ItemAlreadyProcessed(
                "Item %s has already been processed on %s. "
                "Set 'force_old_files' in source_definition "
                "to download old files from cache." % (url, latest_version))

    @staticmethod
    def _write_to_cache(file_path, data, modified_date=None):
        """Write `data` to the cache, suffixing the file name with a unix
        timestamp derived from `modified_date` (defaults to now).
        """
        try:
            # Create all subdirectories
            os.makedirs(os.path.dirname(file_path))
        except OSError, e:
            # Reraise if error is not 'File exists'
            if e.errno != errno.EEXIST:
                raise e

        # Default the version stamp to the current time.
        if not modified_date:
            modified_date = datetime.now()

        modified_date = datetime_to_unixstamp(localize_datetime(modified_date))
        with open('%s-%s' % (file_path, modified_date), 'w') as f:
            f.write(data)
class HTTPCachingMixin(HttpRequestMixin):
    """Mixin that caches HTTP downloads on the local filesystem.

    Cached files live under DATA_DIR_PATH/cache/<index_name>/, sharded by
    the first four characters of the url hash; each cached version is
    suffixed with its modification timestamp.
    """

    # Set by the concrete scraper/extractor class — TODO confirm.
    source_definition = None

    def base_path(self, file_name):
        """Return the sharded cache path for `file_name`.

        The first two and next two characters become subdirectories so no
        single directory accumulates too many files.
        """
        first_dir = file_name[0:2]
        second_dir = file_name[2:4]
        return os.path.join(
            DATA_DIR_PATH,
            'cache',
            self.source_definition['index_name'],
            first_dir,
            second_dir,
            file_name,
        )

    @staticmethod
    def _latest_version(file_path):
        """Return (file_path, newest_version_suffix) for a cached file.

        :raises OSError: when no cached version exists.
        """
        version_paths = glob.glob('%s-*' % file_path)
        if len(version_paths) < 1:
            raise OSError
        versions = [
            os.path.basename(version_path).rpartition("-")[2]
            for version_path in version_paths
        ]
        # NOTE(review): version suffixes are compared as strings;
        # lexicographic order matches numeric order only while all
        # timestamps have the same digit count — TODO confirm.
        latest_version = sorted(versions, reverse=True)[0]
        return file_path, latest_version,

    @staticmethod
    def _check_path(path):
        """Sanity-check a cached file by size.

        :raises InvalidFile: when the file is smaller than two bytes
            (presumably an OSError subclass, since fetch() catches
            OSError around this call — TODO confirm).
        """
        file_bytes = os.path.getsize(path)
        # Raise OSError if the filesize is smaller than two bytes
        if file_bytes < 2:
            raise InvalidFile

    def fetch(self, url, modified_date):
        """Fetch `url`, serving it from the cache when the resource has
        not been modified since the cached version.

        :param url: resource URL to download.
        :param modified_date: last-modified date string of the resource.
            NOTE(review): passed unguarded to str_to_datetime — a falsy
            value may raise; confirm callers always supply it.
        :return: the raw body of the resource.
        """
        modified_date = localize_datetime(str_to_datetime(modified_date))
        url_hash = get_sha1_hash(url)
        base_path = self.base_path(url_hash)

        try:
            file_path, latest_version = self._latest_version(base_path)
            latest_version_path = '%s-%s' % (file_path, latest_version)
            self._check_path(latest_version_path)
        except OSError:
            # File does not exist, download and cache the url
            data = self._download_file(url)
            self._write_to_cache(base_path, data, modified_date)
            return data

        if modified_date and modified_date > str_to_datetime(latest_version):
            # If file has been modified download it
            data = self._download_file(url)
            self._write_to_cache(base_path, data, modified_date)
            return data
        else:
            # todo force_old_files
            with open(latest_version_path, 'rb') as f:
                return f.read()

    def _download_file(self, url):
        """GET `url` and return the response body.

        :raises: the requests HTTPError for non-2xx responses
            (via raise_for_status).
        """
        resp = self.http_session.get(url)
        resp.raise_for_status()
        return resp.content

    @staticmethod
    def _write_to_cache(file_path, data, modified_date=None):
        """Write `data` to the cache, suffixing the file name with a unix
        timestamp derived from `modified_date` (defaults to now).
        """
        try:
            # Create all subdirectories
            os.makedirs(os.path.dirname(file_path))
        except OSError, e:
            # Reraise if error is not 'File exists'
            if e.errno != errno.EEXIST:
                raise e

        # Default the version stamp to the current time.
        if not modified_date:
            modified_date = datetime.now()

        modified_date = datetime_to_unixstamp(localize_datetime(modified_date))
        with open('%s-%s' % (file_path, modified_date), 'w') as f:
            f.write(data)