def begin(self) -> None:
    if self.have_active_commit():
        raise Exception('Commit already in progress')

    active_files = {}
    head = self.get_head()
    if head != 'root':
        commit = self.read_commit_index_object(head)
        active_files = self.flatten_dir_tree(self.read_dir_tree(commit['tree_root']))

    # Active commit files stores all of the files which will be in this revision,
    # including ones carried over from the previous revision
    sfs.file_put_contents(sfs.cpjoin(self.base_path, 'active_commit_files'),
                          bytes(json.dumps(active_files), encoding='utf8'))

    # Active commit changes stores a log of files which have been added, changed
    # or deleted in this revision
    sfs.file_put_contents(sfs.cpjoin(self.base_path, 'active_commit_changes'),
                          bytes(json.dumps([]), encoding='utf8'))

    # Store that there is an active commit
    sfs.file_put_contents(sfs.cpjoin(self.base_path, 'active_commit'), b'true')
def fs_put_from_file(self, source_file: str, file_info) -> None:
    if not self.have_active_commit():
        raise Exception('No active commit')

    file_info['hash'] = file_hash = sfs.hash_file(source_file)
    target_base = sfs.cpjoin(self.base_path, 'files', file_hash[:2])
    target = sfs.cpjoin(target_base, file_hash[2:])

    if not os.path.isfile(target):
        # Log items which don't already exist so that we do not have to read the objects referenced in
        # all existing commits to determine if the new objects are garbage in case of a commit rollback
        self.gc_log_item('file', file_hash)

        # ---
        sfs.make_dirs_if_dont_exist(target_base)
        shutil.move(source_file, target)
    else:
        os.remove(source_file)

    #=======================================================
    # Update commit changes
    #=======================================================
    def helper(contents):
        # 'contents' is a list of change records, so compare by path. (The original
        # tested "file_info['path'] in contents", a string against dicts, which was
        # always False.)
        already_logged = any(change['path'] == file_info['path'] for change in contents)
        file_info['status'] = 'changed' if already_logged else 'new'
        return contents + [file_info]

    self.update_system_file('active_commit_changes', helper)

    #=======================================================
    # Update commit files
    #=======================================================
    def helper2(contents):
        contents[file_info['path']] = file_info
        return contents

    self.update_system_file('active_commit_files', helper2)
def commit(self, commit_message, commit_by, commit_datetime=None) -> str:
    if not self.have_active_commit():
        raise Exception('No active commit')

    current_changes = json.loads(sfs.file_get_contents(
        sfs.cpjoin(self.base_path, 'active_commit_changes')))
    active_files = json.loads(sfs.file_get_contents(
        sfs.cpjoin(self.base_path, 'active_commit_files')))

    if current_changes == []:
        raise Exception('Empty commit')

    # Create and store the file tree
    tree_root = self.write_dir_tree(self.build_dir_tree(active_files))

    # If no commit message is passed, store an indication of what was changed
    if commit_message == '':
        new_item = next((change for change in current_changes
                         if change['status'] in ['new', 'changed']), None)
        deleted_item = next((change for change in current_changes
                             if change['status'] == 'deleted'), None)

        commit_message = "(Generated message)\n"
        if new_item is not None:
            commit_message += new_item['status'] + ' ' + new_item['path'] + '\n'
        if deleted_item is not None:
            commit_message += deleted_item['status'] + ' ' + deleted_item['path'] + '\n'
        if len(current_changes) > 2:
            commit_message += '...'

    # Commit timestamp
    commit_datetime = datetime.utcnow() if commit_datetime is None else commit_datetime
    commit_timestamp = commit_datetime.strftime("%d-%m-%Y %H:%M:%S:%f")

    # Create commit
    commit_object_hash = self.write_index_object('commit', {
        'parent':         self.get_head(),
        'utc_date_time':  commit_timestamp,
        'commit_by':      commit_by,
        'commit_message': commit_message,
        'tree_root':      tree_root,
        'changes':        current_changes})

    # Update head; write plus move for atomicity
    sfs.file_put_contents(sfs.cpjoin(self.base_path, 'new_head'),
                          bytes(commit_object_hash, encoding='utf8'))
    os.rename(sfs.cpjoin(self.base_path, 'new_head'),
              sfs.cpjoin(self.base_path, 'head'))

    # And clean up working state
    os.remove(sfs.cpjoin(self.base_path, 'active_commit_changes'))
    os.remove(sfs.cpjoin(self.base_path, 'active_commit_files'))
    sfs.ignore(os.remove, sfs.cpjoin(self.base_path, 'gc_log'))
    os.remove(sfs.cpjoin(self.base_path, 'active_commit'))

    return commit_object_hash
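# A minimal usage sketch of the commit lifecycle above, mirroring the pattern used by
# test_get_changes_since below. 'repo_path', the staged file name, the target path, the
# message, and the committer name are all hypothetical.
def example_commit_lifecycle(repo_path: str) -> str:
    data_store = versioned_storage(repo_path)
    data_store.begin()
    # fs_put_from_file moves the source file into content-addressed storage,
    # so the staged file no longer exists afterwards
    data_store.fs_put_from_file(cpjoin(repo_path, 'staged_upload'),
                                {'path': '/docs/readme.txt'})
    # commit returns the object hash of the new head commit
    return data_store.commit('add readme', 'alice')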
def test_storage_put_rollback(self):
    """ Test that file put rolls back correctly """
    s = storage(DATA_DIR, CONF_DIR)
    s.begin()
    s.file_put_contents('hello', b'test content')
    s.rollback()

    self.assertFalse(os.path.isfile(cpjoin(DATA_DIR, 'hello')),
                     msg='File "hello" still exists, put rollback failed')

    self.assertTrue(os.path.isfile(cpjoin(DATA_DIR, CONF_DIR, BACKUP_DIR, '1_hello')),
                    msg='Backup file "1_hello" does not exist, put rollback failed')
def have_active_commit(self) -> bool:
    """ Checks whether a commit is currently in progress in this repository """
    commit_state = sfs.file_or_default(sfs.cpjoin(self.base_path, 'active_commit'), None)
    return commit_state is not None
def with_exclusive_lock():
    if not varify_user_lock(repository_path, session_token):
        return fail(lock_fail_msg)

    #===
    data_store = versioned_storage(repository_path)
    if not data_store.have_active_commit():
        return fail(no_active_commit_msg)

    # There is no valid reason for path traversal characters to be in a file path within this system
    file_path = request.headers['path']
    if any(True for item in re.split(r'\\|/', file_path) if item in ['..', '.']):
        return fail()

    #===
    tmp_path = cpjoin(repository_path, 'tmp_file')
    with open(tmp_path, 'wb') as f:
        while True:
            chunk = request.body.read(1000 * 1000)
            if not chunk:  # handles both b'' and None at end of stream
                break
            f.write(chunk)

    #===
    data_store.fs_put_from_file(tmp_path, {'path': file_path})

    # Update the user lock expiry
    update_user_lock(repository_path, session_token)
    return success()
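# Self-contained illustration of the traversal check above: splitting on both
# separator styles catches '..' and '.' components regardless of which path
# convention the client sends. The helper name and sample paths are hypothetical.
import re

def _is_traversal_free(file_path: str) -> bool:
    """ True if no path component is '.' or '..' """
    return not any(item in ['..', '.'] for item in re.split(r'\\|/', file_path))

# _is_traversal_free('/docs/readme.txt')  -> True
# _is_traversal_free('../etc/passwd')     -> False
# _is_traversal_free('docs\\..\\secret')  -> False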
def rollback(self) -> None:
    if not self.have_active_commit():
        raise Exception('No active commit')

    gc_log_contents: str = sfs.file_or_default(
        sfs.cpjoin(self.base_path, 'gc_log'), b'').decode('utf8')
    gc_log_items = [file_row.split(' ') for file_row in gc_log_contents.splitlines()]

    if gc_log_items != []:
        # If a commit exists and its hash matches the current head we do not need to do anything:
        # the commit succeeded but we failed before deleting the active commit file for some reason
        is_commit = next((item for item in gc_log_items if item[0] == 'commit'), None)
        if is_commit is not None and is_commit[1] == self.get_head():
            pass  # commit actually ok
        else:  # commit not ok
            for item in gc_log_items:
                # Delete the object for this file, noting that it may not exist
                object_dir = 'files' if item[0] == 'file' else 'index'
                target_base = sfs.cpjoin(self.base_path, object_dir, item[1][:2])
                sfs.ignore(os.remove, sfs.cpjoin(target_base, item[1][2:]))
                sfs.ignore(os.rmdir, target_base)

    sfs.ignore(os.remove, sfs.cpjoin(self.base_path, 'active_commit_changes'))
    sfs.ignore(os.remove, sfs.cpjoin(self.base_path, 'active_commit_files'))
    sfs.ignore(os.remove, sfs.cpjoin(self.base_path, 'gc_log'))

    # If rollback is being called, this file should always exist
    os.remove(sfs.cpjoin(self.base_path, 'active_commit'))
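# For reference, a possible gc_log as consumed by rollback above: one
# '<type> <hash>' entry per line, single-space separated, exactly as written
# by gc_log_item at the end of this listing (these hashes are illustrative):
#
#   file 9f86d081884c7d659a2feaa0c55ad015a3bf4f1b2b0b822cd15d6c15b0f00a08
#   commit dec2e4bc4992314a9c9a51bbd859e1b081b74178818c53c19d18d6f761f5d804
#
# If the 'commit' entry matches the current head, the commit actually completed
# and nothing is deleted; otherwise every logged object is removed from the
# 'files' or 'index' object store.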
def read_user_lock(repository_path: str):
    """ Read and parse the user lock file, returning None if it is missing, empty or invalid """
    try:
        user_lock = file_get_contents(cpjoin(repository_path, 'user_file'))
        if user_lock == '':
            return None
        return json.loads(user_lock)
    except (IOError, ValueError):
        return None
def test_storage_move_overwrite_rollback(self):
    """ Test file move rolls back correctly when the move overwrites another file """
    s = storage(DATA_DIR, CONF_DIR)
    s.begin()
    s.file_put_contents('hello', b'test content')
    s.file_put_contents('hello2', b'test content 2')
    s.commit(True)
    s.move_file('hello', 'hello2')
    s.rollback()

    self.assertTrue(os.path.isfile(cpjoin(DATA_DIR, 'hello')),
                    msg='File "hello" does not exist, move overwrite rollback failed')

    self.assertTrue(os.path.isfile(cpjoin(DATA_DIR, 'hello2')),
                    msg='File "hello2" does not exist, move overwrite rollback failed')
def read_index_object(self, object_hash: str, expected_object_type: str) -> indexObject:
    index_object: indexObject = json.loads(sfs.file_get_contents(
        sfs.cpjoin(self.base_path, 'index', object_hash[:2], object_hash[2:])))

    if index_object['type'] != expected_object_type:
        raise IOError('Type of object does not match expected type')
    return index_object
def write_index_object(self, object_type: str, contents: Dict[str, Any]) -> str:
    new_object: indexObject = {'type': object_type}
    new_object.update(contents)  # type: ignore

    serialised = json.dumps(new_object)
    object_hash = hashlib.sha256(bytes(serialised, encoding='utf8')).hexdigest()
    target_base = sfs.cpjoin(self.base_path, 'index', object_hash[:2])
    if os.path.isfile(sfs.cpjoin(target_base, object_hash[2:])):
        return object_hash

    # Log items which do not exist for garbage collection
    self.gc_log_item(object_type, object_hash)

    #----
    sfs.make_dirs_if_dont_exist(target_base)
    sfs.file_put_contents(sfs.cpjoin(target_base, object_hash[2:]),
                          bytes(serialised, encoding='utf8'))
    return object_hash
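# Self-contained sketch of the content-addressed layout used by write_index_object
# above: an object lives at <hash[:2]>/<hash[2:]>, so identical content always maps
# to the same path and only needs to be written once. The helper name is hypothetical.
import hashlib
import json

def example_object_path(contents: dict) -> str:
    serialised = json.dumps(contents)
    object_hash = hashlib.sha256(bytes(serialised, encoding='utf8')).hexdigest()
    # the two-character fan-out directory keeps any single directory from growing huge
    return object_hash[:2] + '/' + object_hash[2:]

# example_object_path({'type': 'commit'}) returns the same path on every call, which
# is why a pre-existing file short-circuits the write above.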
def update_user_lock(repository_path: str, session_token: bytes):
    """ Write or clear the user lock file """
    # NOTE: ALWAYS call this within a lock_access callback. While the user lock
    # file should only ever be written within a lock_access callback, it is
    # sometimes read asynchronously. Because of this, updates to the file must
    # be atomic; write plus move is used to achieve this.
    real_path: str = cpjoin(repository_path, 'user_file')
    tmp_path: str = cpjoin(repository_path, 'new_user_file')

    with open(tmp_path, 'w') as fd2:
        if session_token is None:
            fd2.write('')
        else:
            fd2.write(json.dumps({'session_token': session_token.decode('utf8'),
                                  'expires': int(time.time()) + 30}))
        fd2.flush()

    os.rename(tmp_path, real_path)
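# Generic form of the write-plus-rename pattern used above and in commit's head
# update, as a self-contained helper. The os.fsync call is an extra durability
# step that the surrounding code does not take (an assumption, drop it if not
# wanted); os.rename is atomic on POSIX when both paths are on one filesystem.
import os

def atomic_write(real_path: str, tmp_path: str, data: str) -> None:
    with open(tmp_path, 'w') as fd:
        fd.write(data)
        fd.flush()
        os.fsync(fd.fileno())  # assumption: flush to disk before the rename
    os.rename(tmp_path, real_path)  # readers see either the old or the new file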
def lock_access(repository_path: str, callback: Callable[[], Responce]):
    """ Synchronise access to the user file between processes; this specifies
        which user is allowed write access at the current time """
    with open(cpjoin(repository_path, 'lock_file'), 'w') as fd:
        try:
            fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
            returned = callback()
            fcntl.flock(fd, fcntl.LOCK_UN)
            return returned
        except IOError:
            return fail(lock_fail_msg)
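# Hedged sketch of how lock_access combines with the user-lock helpers in this
# listing: take the inter-process flock, check or acquire the user lock, do the
# work, then refresh the lock expiry. The wrapper name and callback body are
# hypothetical; can_aquire_user_lock and update_user_lock appear below.
def example_locked_update(repository_path: str, session_token: bytes):
    def callback():
        if not can_aquire_user_lock(repository_path, session_token):
            return fail(lock_fail_msg)
        update_user_lock(repository_path, session_token)  # refresh the 30s expiry
        return success()
    return lock_access(repository_path, callback)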
def init(unlocked=False):
    global data_store, server_connection, config

    try:
        config = json.loads(file_get_contents(cpjoin(
            working_copy_base_path, '.shttpfs', 'client_configuration.json')))
    except IOError:
        raise SystemExit('No shttpfs configuration found')
    except ValueError:
        raise SystemExit('Configuration file syntax error')

    # Sanity-check lock: only one client may use the working copy at any time
    try:
        lockfile = open(cpjoin(working_copy_base_path, '.shttpfs', 'lock_file'), 'w')
        fcntl.flock(lockfile, fcntl.LOCK_EX | fcntl.LOCK_NB)
    except IOError:
        raise SystemExit('Could not lock working copy')

    #-----------
    ignore_filters: str = file_or_default(
        cpjoin(working_copy_base_path, '.shttpfs_ignore'), b'').decode('utf8')
    pull_ignore_filters: str = file_or_default(
        cpjoin(working_copy_base_path, '.shttpfs_pull_ignore'), b'').decode('utf8')

    #-----------
    config['ignore_filters'] = ['/.shttpfs*'] + ignore_filters.splitlines()
    config['pull_ignore_filters'] = pull_ignore_filters.splitlines()
    config['data_dir'] = working_copy_base_path

    if not unlocked:
        config["private_key"] = crypto.unlock_private_key(config["private_key"])

    data_store = plain_storage(config['data_dir'])
    server_connection = client_http_request(config['server_domain'])
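# A plausible client_configuration.json for init above. Only 'server_domain' and
# 'private_key' are read in this listing; the values shown are invented and real
# configurations may carry additional fields:
#
#   {
#       "server_domain": "https://shttpfs.example.com",
#       "private_key":   "<base64 encoded, passphrase-locked private key>"
#   }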
def varify_user_lock(repository_path: str, session_token: bytes):
    """ Verify that a returning user has a valid token and their lock has not expired """
    with open(cpjoin(repository_path, 'user_file'), 'r') as fd2:
        content = fd2.read()

    if len(content) == 0:
        return False

    try:
        res = json.loads(content)
    except ValueError:
        return False

    return (res['session_token'].encode('utf8') == session_token
            and int(time.time()) < int(res['expires']))
def test_storage_delete_rollback(self):
    """ Test file delete rolls back correctly """
    s = storage(DATA_DIR, CONF_DIR)
    s.begin()
    s.file_put_contents('hello', b'test content')
    s.commit(True)
    s.delete_file('hello')
    s.rollback()

    self.assertTrue(os.path.isfile(cpjoin(DATA_DIR, 'hello')),
                    msg='File "hello" does not exist, delete rollback failed')
def test_storage_move_rollback(self):
    """ Test file move rolls back correctly """
    s = storage(DATA_DIR, CONF_DIR)
    s.begin()
    s.file_put_contents('hello', b'test content')
    s.commit(True)
    s.move_file('hello', 'hello2')
    s.rollback()

    self.assertFalse(os.path.isfile(cpjoin(DATA_DIR, 'hello2')),
                     msg='File "hello2" still exists, move rollback failed')
def commit(self, cont: bool = False):
    """ Finish a transaction """
    if self.journal is None:
        raise Exception('Must call begin first')

    self.journal.close()  # type: ignore
    self.journal = None
    os.remove(self.j_file)

    for itm in os.listdir(self.tmp_dir):
        os.remove(cpjoin(self.tmp_dir, itm))

    if cont:
        self.begin()
def test_hash_file(self):
    """ Test that file hash returns the correct result """
    make_data_dir()

    file_path = cpjoin(DATA_DIR, 'test')
    file_put_contents(file_path, b'some file contents')

    expected_result = 'cf57fcf9d6d7fb8fd7d8c30527c8f51026aa1d99ad77cc769dd0c757d4fe8667'
    result = hash_file(file_path)
    self.assertEqual(expected_result, result, msg='Hashes are not the same')

    delete_data_dir()
def can_aquire_user_lock(repository_path: str, session_token: bytes):
    """ Allow a user to acquire the lock if no other user is currently using it,
        if the original user is returning (presumably after a network error),
        or if the lock has expired """
    # NOTE: ALWAYS use within a lock_access callback
    user_file_path: str = cpjoin(repository_path, 'user_file')
    if not os.path.isfile(user_file_path):
        return True

    with open(user_file_path, 'r') as fd2:
        content: str = fd2.read()
    if len(content) == 0:
        return True

    try:
        res = json.loads(content)
    except ValueError:
        return True

    if res['expires'] < int(time.time()):
        return True
    # The stored token is a utf8 string while session_token is bytes, so encode
    # before comparing (matching varify_user_lock above)
    if res['session_token'].encode('utf8') == session_token:
        return True
    return False
def have_authenticated_user(client_ip: str, repository: str, session_token: bytes):
    """ Check the user-submitted session token against the db, and that the ip has not changed """
    if repository not in config['repositories']:
        return False

    repository_path = config['repositories'][repository]['path']
    conn = auth_db_connect(cpjoin(repository_path, 'auth_transient.db'))

    # Garbage collect session tokens, excluding the token of the client which currently owns the
    # user lock. Large files can take a long time to upload, and during the upload the lock's
    # expiry is not being updated, so it can expire; this matters here because the session tokens
    # table is garbage collected every time a user authenticates. It does not matter if the user
    # lock expires while the client also holds the flock, as it is pushed into the future at the
    # end of the current operation.
    user_lock = read_user_lock(repository_path)
    active_commit = user_lock['session_token'] if user_lock is not None else None

    if active_commit is not None:
        conn.execute("delete from session_tokens where expires < ? and token != ?",
                     (time.time(), active_commit))
    else:
        conn.execute("delete from session_tokens where expires < ?", (time.time(),))

    # Get the session token
    res = conn.execute("select * from session_tokens where token = ? and ip = ?",
                       (session_token, client_ip)).fetchall()

    if res != [] and repository in config['users'][res[0]['username']]['uses_repositories']:
        conn.execute("update session_tokens set expires = ? where token = ? and ip = ?",
                     (time.time() + extend_session_duration, session_token, client_ip))
        conn.commit()  # to make sure the update and the delete have the same view
        return res[0]

    conn.commit()
    return False
def begin_auth(request: Request) -> Responce:
    """ Request an authentication token to sign """
    repository = request.headers['repository']
    if repository not in config['repositories']:
        return fail(no_such_repo_msg)

    #==
    repository_path = config['repositories'][repository]['path']
    conn = auth_db_connect(cpjoin(repository_path, 'auth_transient.db'))
    gc_tokens(conn)

    # Issue a new token
    auth_token = base64.b64encode(pysodium.randombytes(35)).decode('utf-8')
    conn.execute("insert into tokens (expires, token, ip) values (?,?,?)",
                 (time.time() + 30, auth_token, request.remote_addr))
    conn.commit()

    return success({'auth_token': auth_token})
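# Hedged sketch of the client's side of this challenge-response: the auth_token
# issued above would be signed with the client's private key and returned to the
# server for verification. This listing does not show the real client flow; the
# function name and key handling are assumptions, and only
# pysodium.crypto_sign_detached is a known library call.
import base64
import pysodium

def example_sign_auth_token(auth_token: str, private_key: bytes) -> str:
    signature = pysodium.crypto_sign_detached(auth_token.encode('utf8'), private_key)
    return base64.b64encode(signature).decode('utf8')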
def test_get_changes_since(self):
    file_put_contents(cpjoin(DATA_DIR, 'test 1'), b'test')
    file_put_contents(cpjoin(DATA_DIR, 'test 2'), b'test 1')
    file_put_contents(cpjoin(DATA_DIR, 'test 3'), b'test 2')

    #==================
    data_store = versioned_storage(DATA_DIR)
    data_store.begin()
    data_store.fs_put_from_file(cpjoin(DATA_DIR, 'test 1'), {'path': '/test/path'})
    id1 = data_store.commit('test msg', 'test user')

    changes = data_store.get_changes_since('root', data_store.get_head())
    self.assertEqual(changes, {
        '/test/path': {
            'hash': '9f86d081884c7d659a2feaa0c55ad015a3bf4f1b2b0b822cd15d6c15b0f00a08',
            'path': '/test/path',
            'status': 'new'}})

    #==================
    data_store.begin()
    data_store.fs_put_from_file(cpjoin(DATA_DIR, 'test 2'), {'path': '/another/path'})
    data_store.fs_put_from_file(cpjoin(DATA_DIR, 'test 3'), {'path': '/yet/another/path'})
    data_store.commit('test msg', 'test user')

    changes = data_store.get_changes_since(id1, data_store.get_head())
    self.assertEqual(changes, {
        '/another/path': {
            'hash': 'f67213b122a5d442d2b93bda8cc45c564a70ec5d2a4e0e95bb585cf199869c98',
            'path': '/another/path',
            'status': 'new'},
        '/yet/another/path': {
            'hash': 'dec2e4bc4992314a9c9a51bbd859e1b081b74178818c53c19d18d6f761f5d804',
            'path': '/yet/another/path',
            'status': 'new'}})
def pull_file(request: Request) -> Responce:
    """ Get a file from the server """
    session_token = request.headers['session_token'].encode('utf8')
    repository = request.headers['repository']

    #===
    current_user = have_authenticated_user(request.remote_addr, repository, session_token)
    if current_user is False:
        return fail(user_auth_fail_msg)

    #===
    data_store = versioned_storage(config['repositories'][repository]['path'])
    file_info = data_store.get_file_info_from_path(request.headers['path'])

    full_file_path: str = cpjoin(data_store.get_file_directory_path(file_info['hash']),
                                 file_info['hash'][2:])
    return success({'file_info_json': json.dumps(file_info)}, ServeFile(full_file_path))
def test_rollback(self):
    file_put_contents(cpjoin(DATA_DIR, 'test 1'), b'test')
    file_put_contents(cpjoin(DATA_DIR, 'test 2'), b'test')
    file_put_contents(cpjoin(DATA_DIR, 'test 3'), b'test 2')

    #==================
    data_store = versioned_storage(DATA_DIR)
    data_store.begin()
    data_store.fs_put_from_file(cpjoin(DATA_DIR, 'test 1'), {'path': '/test/path'})
    data_store.commit('test msg', 'test user')

    data_store.begin()
    data_store.fs_put_from_file(cpjoin(DATA_DIR, 'test 2'), {'path': '/another/path'})
    data_store.fs_put_from_file(cpjoin(DATA_DIR, 'test 3'), {'path': '/yet/another/path'})
    data_store.rollback()

    self.assertEqual(os.listdir(cpjoin(DATA_DIR, 'files')), ['9f'])
    self.assertEqual(os.listdir(cpjoin(DATA_DIR, 'files', '9f')),
                     ['86d081884c7d659a2feaa0c55ad015a3bf4f1b2b0b822cd15d6c15b0f00a08'])
def test_storage_multiple_rollback(self):
    """ Test rollback of multiple things at once """
    s = storage(DATA_DIR, CONF_DIR)
    s.begin()
    s.file_put_contents('hello', b'test content')
    s.commit(True)

    s.file_put_contents('hello2', b'test content 2')
    s.file_put_contents('hello3', b'test content 3')
    s.move_file('hello', 'goodbye')
    s.move_file('hello2', 'hello3')
    s.delete_file('hello3')
    s.file_put_contents('hello3', b'something else')
    s.rollback()

    self.assertTrue(os.path.isfile(cpjoin(DATA_DIR, 'hello')),
                    msg='File "hello" does not exist, multiple rollback failed')

    self.assertFalse(os.path.isfile(cpjoin(DATA_DIR, 'hello3')),
                     msg='File "hello3" still exists, multiple rollback failed')

    self.assertFalse(os.path.isfile(cpjoin(DATA_DIR, 'goodbye')),
                     msg='File "goodbye" still exists, multiple rollback failed')

    self.assertTrue(os.path.isfile(cpjoin(DATA_DIR, CONF_DIR, BACKUP_DIR, '1_hello3')),
                    msg='Backup file "1_hello3" does not exist, multiple rollback failed')

    self.assertTrue(os.path.isfile(cpjoin(DATA_DIR, CONF_DIR, BACKUP_DIR, '2_hello3')),
                    msg='Backup file "2_hello3" does not exist, multiple rollback failed')

    self.assertTrue(os.path.isfile(cpjoin(DATA_DIR, CONF_DIR, BACKUP_DIR, '3_hello2')),
                    msg='Backup file "3_hello2" does not exist, multiple rollback failed')
def __init__(self, data_dir):
    """ Set up and validate the file system structure """
    storage.__init__(self, data_dir, '.shttpfs')
    self.manifest_file = cpjoin('.shttpfs', 'manifest.json')
def get_full_file_path(self, *args):
    """ Convert a path relative to the data dir into a full path """
    return cpjoin(self.data_dir, *args)
def get_file_directory_path(self, file_hash: str) -> str:
    """ Return the 'files/<first two hash chars>' directory a file object lives in """
    return sfs.cpjoin(self.base_path, 'files', file_hash[:2])
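# For example, with the sha256 of b'test' (the hash asserted in test_rollback above):
#
#   get_file_directory_path('9f86d081884c7d659a2feaa0c55ad015a3bf4f1b2b0b822cd15d6c15b0f00a08')
#       -> <base_path>/files/9f
#
# and the object itself is stored inside that directory under the remaining
# 62 characters of the hash.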
def gc_log_item(self, item_type: str, item_hash: str) -> None:
    """ Append a '<type> <hash>' entry to the garbage collection log """
    with open(sfs.cpjoin(self.base_path, 'gc_log'), 'a') as gc_log:
        gc_log.write(item_type + ' ' + item_hash + '\n')
        gc_log.flush()