class TestLocalFileSystem(TestDir): def setUp(self): super().setUp() self.fs = LocalFileSystem(None, {}) def test_open(self): with self.fs.open(self.FOO) as fd: self.assertEqual(fd.read(), self.FOO_CONTENTS) with self.fs.open(self.UNICODE, encoding="utf-8") as fd: self.assertEqual(fd.read(), self.UNICODE_CONTENTS) def test_exists(self): self.assertTrue(self.fs.exists(self.FOO)) self.assertTrue(self.fs.exists(self.UNICODE)) self.assertFalse(self.fs.exists("not-existing-file")) def test_isdir(self): self.assertTrue(self.fs.isdir(self.DATA_DIR)) self.assertFalse(self.fs.isdir(self.FOO)) self.assertFalse(self.fs.isdir("not-existing-file")) def test_isfile(self): self.assertTrue(self.fs.isfile(self.FOO)) self.assertFalse(self.fs.isfile(self.DATA_DIR)) self.assertFalse(self.fs.isfile("not-existing-file"))
def test_staging_file(tmp_dir, dvc): from dvc.objects import check from dvc.objects.stage import stage from dvc.objects.transfer import transfer tmp_dir.gen("foo", "foo") fs = LocalFileSystem() local_odb = dvc.odb.local staging_odb, obj = stage(local_odb, tmp_dir / "foo", fs, "md5") assert not local_odb.exists(obj.hash_info) assert staging_odb.exists(obj.hash_info) with pytest.raises(FileNotFoundError): check(local_odb, obj) check(staging_odb, obj) transfer(staging_odb, local_odb, {obj.hash_info}, move=True) check(local_odb, obj) with pytest.raises(FileNotFoundError): check(staging_odb, obj) path_info = local_odb.hash_to_path_info(obj.hash_info.value) assert fs.exists(path_info)
def test_staging_dir(tmp_dir, dvc): from dvc.data import check from dvc.data.stage import stage from dvc.data.transfer import transfer tmp_dir.gen({"dir": {"foo": "foo", "bar": "bar"}}) fs = LocalFileSystem() local_odb = dvc.odb.local staging_odb, _, obj = stage(local_odb, (tmp_dir / "dir").fs_path, fs, "md5") assert not local_odb.exists(obj.hash_info) assert staging_odb.exists(obj.hash_info) with pytest.raises(FileNotFoundError): check(local_odb, obj) check(staging_odb, obj) transfer(staging_odb, local_odb, {obj.hash_info}, shallow=False, hardlink=True) check(local_odb, obj) check(staging_odb, obj) path = local_odb.hash_to_path(obj.hash_info.value) assert fs.exists(path)
def test_local_fs_exists(tmp_dir): tmp_dir.gen({ "foo": "foo", "bar": "bar", "тест": "проверка", "code.py": "import sys\nimport shutil\n" "shutil.copyfile(sys.argv[1], sys.argv[2])", "data_dir": { "data": "data", "data_sub_dir": { "data_sub": "data_sub" }, }, }) fs = LocalFileSystem() assert fs.exists("foo") assert fs.exists("тест") assert not fs.exists("not-existing-file")
class State(StateBase): # pylint: disable=too-many-instance-attributes def __init__(self, root_dir=None, tmp_dir=None): from diskcache import Cache super().__init__() self.tmp_dir = tmp_dir self.root_dir = root_dir self.fs = LocalFileSystem(None, {"url": self.root_dir}) if not tmp_dir: return config = {"eviction_policy": "least-recently-used"} self.links = Cache(directory=os.path.join(tmp_dir, "links"), **config) self.md5s = Cache(directory=os.path.join(tmp_dir, "md5s"), **config) def close(self): self.md5s.close() self.links.close() def save(self, path_info, fs, hash_info): """Save hash for the specified path info. Args: path_info (dict): path_info to save hash for. hash_info (HashInfo): hash to save. """ if not isinstance(fs, LocalFileSystem): return assert isinstance(path_info, str) or path_info.scheme == "local" assert hash_info assert isinstance(hash_info, HashInfo) assert os.path.exists(path_info) mtime, size = get_mtime_and_size(path_info, self.fs) inode = get_inode(path_info) logger.debug("state save (%s, %s, %s) %s", inode, mtime, size, hash_info.value) self.md5s[inode] = (mtime, size, hash_info.value) def get(self, path_info, fs): """Gets the hash for the specified path info. Hash will be retrieved from the state database if available. Args: path_info (dict): path info to get the hash for. Returns: HashInfo or None: hash for the specified path info or None if it doesn't exist in the state database. """ if not isinstance(fs, LocalFileSystem): return None assert isinstance(path_info, str) or path_info.scheme == "local" path = os.fspath(path_info) # NOTE: use os.path.exists instead of LocalFileSystem.exists # because it uses lexists() and will return True for broken # symlinks that we cannot stat() in get_mtime_and_size if not os.path.exists(path): return None mtime, size = get_mtime_and_size(path, self.fs) inode = get_inode(path) value = self.md5s.get(inode) if not value or value[0] != mtime or value[1] != size: return None return HashInfo("md5", value[2], size=int(size)) def save_link(self, path_info, fs): """Adds the specified path to the list of links created by dvc. This list is later used on `dvc checkout` to cleanup old links. Args: path_info (dict): path info to add to the list of links. """ if not isinstance(fs, LocalFileSystem): return assert isinstance(path_info, str) or path_info.scheme == "local" if not self.fs.exists(path_info): return mtime, _ = get_mtime_and_size(path_info, self.fs) inode = get_inode(path_info) relative_path = relpath(path_info, self.root_dir) with self.links as ref: ref[relative_path] = (inode, mtime) def get_unused_links(self, used, fs): """Removes all saved links except the ones that are used. Args: used (list): list of used links that should not be removed. """ if not isinstance(fs, LocalFileSystem): return unused = [] with self.links as ref: for relative_path in ref: path = os.path.join(self.root_dir, relative_path) if path in used or not self.fs.exists(path): continue inode = get_inode(path) mtime, _ = get_mtime_and_size(path, self.fs) if ref[relative_path] == (inode, mtime): logger.debug("Removing '%s' as unused link.", path) unused.append(relative_path) return unused def remove_links(self, unused, fs): if not isinstance(fs, LocalFileSystem): return for path in unused: remove(os.path.join(self.root_dir, path)) with self.links as ref: for path in unused: del ref[path]
class State(StateBase): # pylint: disable=too-many-instance-attributes """Class for the state database. Args: repo (dvc.repo.Repo): repo instance that this state belongs to. config (configobj.ConfigObj): config for the state. Raises: StateVersionTooNewError: thrown when dvc version is older than the state database version. """ VERSION = 3 STATE_FILE = "state" STATE_TABLE = "state" STATE_TABLE_LAYOUT = ( "inode INTEGER PRIMARY KEY, " "mtime TEXT NOT NULL, " "size TEXT NOT NULL, " "md5 TEXT NOT NULL, " "timestamp TEXT NOT NULL" ) STATE_INFO_TABLE = "state_info" STATE_INFO_TABLE_LAYOUT = "count INTEGER" STATE_INFO_ROW = 1 LINK_STATE_TABLE = "link_state" LINK_STATE_TABLE_LAYOUT = ( "path TEXT PRIMARY KEY, " "inode INTEGER NOT NULL, " "mtime TEXT NOT NULL" ) STATE_ROW_LIMIT = 100000000 STATE_ROW_CLEANUP_QUOTA = 50 MAX_INT = 2 ** 63 - 1 MAX_UINT = 2 ** 64 - 2 def __init__(self, repo): super().__init__() self.repo = repo self.root_dir = repo.root_dir self.fs = LocalFileSystem(None, {"url": self.root_dir}) state_config = repo.config.get("state", {}) self.row_limit = state_config.get("row_limit", self.STATE_ROW_LIMIT) self.row_cleanup_quota = state_config.get( "row_cleanup_quota", self.STATE_ROW_CLEANUP_QUOTA ) if not repo.tmp_dir: self.state_file = None return self.state_file = os.path.join(repo.tmp_dir, self.STATE_FILE) self.database = None self.cursor = None self.inserts = 0 def _execute(self, cmd, parameters=()): logger.trace(cmd) return self.cursor.execute(cmd, parameters) def _fetchall(self): ret = self.cursor.fetchall() logger.trace("fetched: %s", ret) return ret def _to_sqlite(self, num): assert num >= 0 assert num < self.MAX_UINT # NOTE: sqlite stores unit as signed ints, so maximum uint is 2^63-1 # see http://jakegoulding.com/blog/2011/02/06/sqlite-64-bit-integers/ if num > self.MAX_INT: ret = -(num - self.MAX_INT) else: ret = num assert self._from_sqlite(ret) == num return ret def _from_sqlite(self, num): assert abs(num) <= self.MAX_INT if num < 0: return abs(num) + self.MAX_INT assert num < self.MAX_UINT assert num >= 0 return num def _prepare_db(self, empty=False): from dvc import __version__ if not empty: cmd = "PRAGMA user_version;" self._execute(cmd) ret = self._fetchall() assert len(ret) == 1 assert len(ret[0]) == 1 assert isinstance(ret[0][0], int) version = ret[0][0] if version > self.VERSION: raise StateVersionTooNewError( __version__, self.VERSION, version ) elif version < self.VERSION: logger.warning( "State file version '%d' is too old. " "Reformatting to the current version '%d'.", version, self.VERSION, ) cmd = "DROP TABLE IF EXISTS {};" self._execute(cmd.format(self.STATE_TABLE)) self._execute(cmd.format(self.STATE_INFO_TABLE)) self._execute(cmd.format(self.LINK_STATE_TABLE)) # Check that the state file is indeed a database cmd = "CREATE TABLE IF NOT EXISTS {} ({})" self._execute(cmd.format(self.STATE_TABLE, self.STATE_TABLE_LAYOUT)) self._execute( cmd.format(self.STATE_INFO_TABLE, self.STATE_INFO_TABLE_LAYOUT) ) self._execute( cmd.format(self.LINK_STATE_TABLE, self.LINK_STATE_TABLE_LAYOUT) ) cmd = ( "INSERT OR IGNORE INTO {} (count) SELECT 0 " "WHERE NOT EXISTS (SELECT * FROM {})" ) self._execute(cmd.format(self.STATE_INFO_TABLE, self.STATE_INFO_TABLE)) cmd = "PRAGMA user_version = {};" self._execute(cmd.format(self.VERSION)) def load(self): """Loads state database.""" retries = 1 while True: assert self.database is None assert self.cursor is None assert self.inserts == 0 empty = not os.path.exists(self.state_file) # NOTE: we use nolock option because fcntl() lock sqlite uses # doesn't work on some older NFS/CIFS filesystems. # This opens a possibility of data corruption by concurrent writes, # which is prevented by repo lock. self.database = _connect_sqlite(self.state_file, {"nolock": 1}) self.cursor = self.database.cursor() from sqlite3 import DatabaseError # Try loading once to check that the file is indeed a database # and reformat it if it is not. try: self._prepare_db(empty=empty) return except DatabaseError: self.cursor.close() self.database.close() self.database = None self.cursor = None self.inserts = 0 if retries > 0: os.unlink(self.state_file) retries -= 1 else: raise def _vacuum(self): # NOTE: see https://bugs.python.org/issue28518 self.database.isolation_level = None self._execute("VACUUM") self.database.isolation_level = "" def dump(self): """Saves state database.""" assert self.database is not None cmd = "SELECT count from {} WHERE rowid=?".format( self.STATE_INFO_TABLE ) self._execute(cmd, (self.STATE_INFO_ROW,)) ret = self._fetchall() assert len(ret) == 1 assert len(ret[0]) == 1 count = self._from_sqlite(ret[0][0]) + self.inserts if count > self.row_limit: msg = "cleaning up state, this might take a while." logger.warning(msg) delete = count - self.row_limit delete += int(self.row_limit * (self.row_cleanup_quota / 100.0)) cmd = ( "DELETE FROM {} WHERE timestamp IN (" "SELECT timestamp FROM {} ORDER BY timestamp ASC LIMIT {});" ) self._execute( cmd.format(self.STATE_TABLE, self.STATE_TABLE, delete) ) self._vacuum() cmd = "SELECT COUNT(*) FROM {}" self._execute(cmd.format(self.STATE_TABLE)) ret = self._fetchall() assert len(ret) == 1 assert len(ret[0]) == 1 count = ret[0][0] cmd = "UPDATE {} SET count = ? WHERE rowid = ?".format( self.STATE_INFO_TABLE ) self._execute(cmd, (self._to_sqlite(count), self.STATE_INFO_ROW)) self.database.commit() self.cursor.close() self.database.close() self.database = None self.cursor = None self.inserts = 0 @staticmethod def _file_metadata_changed(actual_mtime, mtime, actual_size, size): return actual_mtime != mtime or actual_size != size def _update_state_record_timestamp_for_inode(self, actual_inode): cmd = "UPDATE {} SET timestamp = ? WHERE inode = ?".format( self.STATE_TABLE ) self._execute( cmd, (current_timestamp(), self._to_sqlite(actual_inode)) ) def _update_state_for_path_changed( self, actual_inode, actual_mtime, actual_size, checksum ): cmd = ( "UPDATE {} SET " "mtime = ?, size = ?, " "md5 = ?, timestamp = ? " "WHERE inode = ?" ).format(self.STATE_TABLE) self._execute( cmd, ( actual_mtime, actual_size, checksum, current_timestamp(), self._to_sqlite(actual_inode), ), ) def _insert_new_state_record( self, actual_inode, actual_mtime, actual_size, checksum ): assert checksum is not None cmd = ( "INSERT INTO {}(inode, mtime, size, md5, timestamp) " "VALUES (?, ?, ?, ?, ?)" ).format(self.STATE_TABLE) self._execute( cmd, ( self._to_sqlite(actual_inode), actual_mtime, actual_size, checksum, current_timestamp(), ), ) self.inserts += 1 def get_state_record_for_inode(self, inode): cmd = ( "SELECT mtime, size, md5, timestamp from {} WHERE " "inode=?".format(self.STATE_TABLE) ) self._execute(cmd, (self._to_sqlite(inode),)) results = self._fetchall() if results: # uniqueness constrain on inode assert len(results) == 1 return results[0] return None def save(self, path_info, fs, hash_info): """Save hash for the specified path info. Args: path_info (dict): path_info to save hash for. hash_info (HashInfo): hash to save. """ if not isinstance(fs, LocalFileSystem): return assert isinstance(path_info, str) or path_info.scheme == "local" assert hash_info assert isinstance(hash_info, HashInfo) assert os.path.exists(path_info) actual_mtime, actual_size = get_mtime_and_size(path_info, self.fs) actual_inode = get_inode(path_info) existing_record = self.get_state_record_for_inode(actual_inode) if not existing_record: self._insert_new_state_record( actual_inode, actual_mtime, actual_size, hash_info.value ) return self._update_state_for_path_changed( actual_inode, actual_mtime, actual_size, hash_info.value ) def get(self, path_info, fs): """Gets the hash for the specified path info. Hash will be retrieved from the state database if available. Args: path_info (dict): path info to get the hash for. Returns: HashInfo or None: hash for the specified path info or None if it doesn't exist in the state database. """ if not isinstance(fs, LocalFileSystem): return None assert isinstance(path_info, str) or path_info.scheme == "local" path = os.fspath(path_info) # NOTE: use os.path.exists instead of LocalFileSystem.exists # because it uses lexists() and will return True for broken # symlinks that we cannot stat() in get_mtime_and_size if not os.path.exists(path): return None actual_mtime, actual_size = get_mtime_and_size(path, self.fs) actual_inode = get_inode(path) existing_record = self.get_state_record_for_inode(actual_inode) if not existing_record: return None mtime, size, value, _ = existing_record if self._file_metadata_changed(actual_mtime, mtime, actual_size, size): return None self._update_state_record_timestamp_for_inode(actual_inode) return HashInfo("md5", value, size=int(actual_size)) def save_link(self, path_info, fs): """Adds the specified path to the list of links created by dvc. This list is later used on `dvc checkout` to cleanup old links. Args: path_info (dict): path info to add to the list of links. """ if not isinstance(fs, LocalFileSystem): return assert isinstance(path_info, str) or path_info.scheme == "local" if not self.fs.exists(path_info): return mtime, _ = get_mtime_and_size(path_info, self.fs) inode = get_inode(path_info) relative_path = relpath(path_info, self.root_dir) cmd = "REPLACE INTO {}(path, inode, mtime) " "VALUES (?, ?, ?)".format( self.LINK_STATE_TABLE ) self._execute(cmd, (relative_path, self._to_sqlite(inode), mtime)) def get_unused_links(self, used, fs): """Removes all saved links except the ones that are used. Args: used (list): list of used links that should not be removed. """ if not isinstance(fs, LocalFileSystem): return unused = [] self._execute(f"SELECT * FROM {self.LINK_STATE_TABLE}") for row in self.cursor: relative_path, inode, mtime = row inode = self._from_sqlite(inode) path = os.path.join(self.root_dir, relative_path) if path in used or not self.fs.exists(path): continue actual_inode = get_inode(path) actual_mtime, _ = get_mtime_and_size(path, self.fs) if (inode, mtime) == (actual_inode, actual_mtime): logger.debug("Removing '%s' as unused link.", path) unused.append(relative_path) return unused def remove_links(self, unused, fs): if not isinstance(fs, LocalFileSystem): return for path in unused: remove(path) for chunk_unused in to_chunks( unused, chunk_size=SQLITE_MAX_VARIABLES_NUMBER ): cmd = "DELETE FROM {} WHERE path IN ({})".format( self.LINK_STATE_TABLE, ",".join(["?"] * len(chunk_unused)) ) self._execute(cmd, tuple(chunk_unused))