def testGetThenRefreshOnStartup(self):
  # Regression test: Test that calling Get() but never resolving the future,
  # then Refresh()ing the data, causes the data to be refreshed.
  test_bundle = _TestBundle()
  gfs, fetcher = test_bundle.CreateGfsAndFetcher()
  self.assertTrue(*fetcher.CheckAndReset())

  # Get a predictable version.
  version, data = test_bundle.Mutate()

  read_future = gfs.ReadSingle('hello.txt')
  # Fetch for the Stat(), async-fetch for the Read().
  self.assertTrue(*fetcher.CheckAndReset(fetch_count=1, fetch_async_count=1))

  refresh_future = gfs.Refresh()
  self.assertTrue(*fetcher.CheckAndReset())

  self.assertEqual(data, read_future.Get())
  self.assertTrue(*fetcher.CheckAndReset(fetch_resolve_count=1))
  self.assertEqual(StatInfo(version), gfs.Stat('hello.txt'))
  self.assertTrue(*fetcher.CheckAndReset())

  # The fetch will already have been resolved, so resolving the Refresh won't
  # affect anything.
  refresh_future.Get()
  self.assertTrue(*fetcher.CheckAndReset())

  # Read data should not have changed.
  self.assertEqual(data, gfs.ReadSingle('hello.txt').Get())
  self.assertEqual(StatInfo(version), gfs.Stat('hello.txt'))
  self.assertTrue(*fetcher.CheckAndReset())
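# Note: a minimal sketch of the StatInfo value object that the Stat()
# implementations below return and that the tests compare against. This is an
# assumption inferred from how StatInfo is used in this section (a version
# plus an optional {name: version} map of children), not the real class
# definition, which lives elsewhere in the codebase.
class StatInfo(object):
  def __init__(self, version, child_versions=None):
    self.version = version
    self.child_versions = child_versions

  def __eq__(self, other):
    return (isinstance(other, StatInfo) and
            self.version == other.version and
            self.child_versions == other.child_versions)

  def __ne__(self, other):
    return not (self == other)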
def Stat(self, path):
  version = self._object_store.Get(path, object_store.GITHUB_STAT).Get()
  if version is not None:
    return StatInfo(version)
  try:
    result = self._fetcher.Fetch('commits/HEAD',
                                 username=USERNAME,
                                 password=PASSWORD)
  except urlfetch.DownloadError as e:
    logging.error('GithubFileSystem Stat: %s' % e)
    return self._DefaultStat(path)

  # Check if Github authentication failed.
  if result.status_code == 401:
    logging.error('Github authentication failed for %s, falling back to '
                  'unauthenticated.' % USERNAME)
    try:
      result = self._fetcher.Fetch('commits/HEAD')
    except urlfetch.DownloadError as e:
      logging.error('GithubFileSystem Stat: %s' % e)
      return self._DefaultStat(path)

  version = (json.loads(result.content).get('commit', {})
                                       .get('tree', {})
                                       .get('sha', None))
  # Check if the JSON was valid, and set to 0 if not.
  if version is not None:
    self._object_store.Set(path, version, object_store.GITHUB_STAT)
  else:
    logging.warning('Problem fetching commit hash from github.')
    return self._DefaultStat(path)
  return StatInfo(version)
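# For reference, the response shape that the chained .get() calls above
# assume from the commits/HEAD endpoint (hypothetical values; only
# commit.tree.sha is consumed, and anything missing falls through to the
# warning branch):
#
#   {
#     "commit": {
#       "tree": {
#         "sha": "deadbeef..."
#       }
#     }
#   }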
def Stat(self, path):
  read_result = self.ReadSingle(path)
  stat_result = StatInfo(str(self._stat))
  if isinstance(read_result, list):
    stat_result.child_versions = dict((file_result, str(self._stat))
                                      for file_result in read_result)
  return stat_result
def Stat(self, path):
  '''Stats |path|, returning its version as a StatInfo object. If |path|
  ends with a '/', it is assumed to be a directory and the StatInfo object
  returned includes child_versions for all paths in the directory.

  File paths do not include the name of the zip file, which is arbitrary and
  useless to consumers.

  Because the repository will only be downloaded once per server version, all
  stat versions are always 0.
  '''
  # Trim off the zip file's name.
  path = path.lstrip('/')
  trimmed = [f.split('/', 1)[1] for f in self._GetNamelist()]
  if path not in trimmed:
    raise FileNotFoundError("No stat found for '%s' in %s" % (path, trimmed))
  version = self._GetVersion()
  child_paths = {}
  if path == '' or path.endswith('/'):
    # Deal with a directory.
    for f in filter(lambda s: s.startswith(path), trimmed):
      filename = f[len(path):]
      if '/' not in filename and f != path:
        child_paths[filename] = StatInfo(version)
  return StatInfo(version, child_paths or None)
def Stat(self, path, stats=None):
  """Stats the directory given, or if a file is given, stats the file's
  parent directory to get info about the file.
  """
  # TODO(kalman): store the whole stat info, not just the version.
  version = self._object_store.Get(path, object_store.FILE_SYSTEM_STAT).Get()
  if version is not None:
    return StatInfo(version)

  # Always stat the parent directory, since it will have the stat of the child
  # anyway, and this gives us an entire directory's stat info at once.
  if path.endswith('/'):
    dir_path = path
  else:
    dir_path = path.rsplit('/', 1)[0] + '/'
  dir_stat = self._file_system.Stat(dir_path)

  if path == dir_path:
    version = dir_stat.version
  else:
    version = dir_stat.child_versions.get(path.split('/')[-1], None)
  if version is None:
    raise FileNotFoundError(path)

  mapping = {path: version}
  for child_path, child_version in dir_stat.child_versions.iteritems():
    child_path = dir_path + child_path
    mapping[child_path] = child_version
  self._object_store.SetMulti(mapping, object_store.FILE_SYSTEM_STAT)
  return StatInfo(version)
def Stat(self, path):
  version = self._stat_object_store.Get(path).Get()
  if version is not None:
    return StatInfo(version)
  try:
    result = self._fetcher.Fetch('commits/HEAD',
                                 username=USERNAME,
                                 password=PASSWORD)
  except urlfetch.DownloadError as e:
    logging.warning('GithubFileSystem Stat: %s' % e)
    return self._DefaultStat(path)

  # Check if Github authentication failed.
  if result.status_code == 401:
    logging.warning('Github authentication failed for %s, falling back to '
                    'unauthenticated.' % USERNAME)
    try:
      result = self._fetcher.Fetch('commits/HEAD')
    except urlfetch.DownloadError as e:
      logging.warning('GithubFileSystem Stat: %s' % e)
      return self._DefaultStat(path)

  # Parse response JSON - but sometimes github gives us invalid JSON.
  try:
    version = json.loads(result.content)['sha']
    self._stat_object_store.Set(path, version)
    return StatInfo(version)
  except StandardError as e:
    logging.warning(
        ('%s: got invalid or unexpected JSON from github. Response status '
         'was %s, content %s') % (e, result.status_code, result.content))
    return self._DefaultStat(path)
def Stat(self, path):
  '''Stats |path|, returning its version as a StatInfo object. If |path|
  ends with a '/', it is assumed to be a directory and the StatInfo object
  returned includes child_versions for all paths in the directory.

  File paths do not include the name of the zip file, which is arbitrary and
  useless to consumers.

  Because the repository will only be downloaded once per server version, all
  stat versions are always 0.
  '''
  self._EnsureRepoZip()
  repo_zip = self._repo_zip.Get()
  if path not in repo_zip.Paths():
    raise FileNotFoundError('"%s" does not contain file "%s"' %
                            (self._repo_key, path))
  version = self._stat_cache.Get(self._repo_key).Get()
  assert version is not None, ('There was a zipball in datastore; there '
                               'should be a version cached for it')
  stat_info = StatInfo(version)
  if IsDirectory(path):
    stat_info.child_versions = dict(
        (p, StatInfo(version)) for p in repo_zip.List(path))
  return stat_info
def Stat(self, path):
  version = self._object_store.Get(path, object_store.GITHUB_STAT).Get()
  if version is not None:
    return StatInfo(version)
  version = json.loads(
      self._fetcher.Fetch('commits/HEAD').content)['commit']['tree']['sha']
  self._object_store.Set(path, version, object_store.GITHUB_STAT)
  return StatInfo(version)
def Stat(self, path):
  read_result = self.Read([path]).Get().get(path)
  stat_result = StatInfo(self._SinglePathStat(path))
  if isinstance(read_result, list):
    stat_result.child_versions = dict(
        (file_result, self._SinglePathStat('%s%s' % (path, file_result)))
        for file_result in read_result)
  return stat_result
def Stat(self, path):
  read_result = self.ReadSingle(path).Get()
  stat_result = StatInfo(str(self._stat_tracker.GetVersion(path)))
  if isinstance(read_result, list):
    stat_result.child_versions = dict(
        (file_result,
         str(self._stat_tracker.GetVersion('%s%s' % (path, file_result))))
        for file_result in read_result)
  return stat_result
def run():
  file_system = self._CreateCachingFileSystem(mock_fs, start_empty=True)

  self.assertEqual(
      StatInfo(stat, child_versions={'bob0': stat, 'bob1': stat}),
      file_system.Stat('bob/'))
  self.assertTrue(*mock_fs.CheckAndReset(stat_count=1))
  self.assertEqual(StatInfo(stat), file_system.Stat('bob/bob0'))
  self.assertEqual(StatInfo(stat), file_system.Stat('bob/bob0'))
  self.assertTrue(*mock_fs.CheckAndReset())
def _CreateStatInfo(self, html):
  def inner_text(node):
    '''Like node.innerText in JS DOM, but strips surrounding whitespace.
    '''
    text = []
    if node.nodeValue:
      text.append(node.nodeValue)
    if hasattr(node, 'childNodes'):
      for child_node in node.childNodes:
        text.append(inner_text(child_node))
    return ''.join(text).strip()

  dom = self._ParseHTML(html)
  # Try all of the tables until we find the one that contains the data.
  for table in dom.getElementsByTagName('table'):
    # Within the table there is a list of files. However, there may be some
    # things beforehand; a header, "parent directory" list, etc. We will deal
    # with that below by being generous and just ignoring such rows.
    rows = table.getElementsByTagName('tr')
    child_versions = {}
    for row in rows:
      # Within each row there are probably 5 cells; name, version, age,
      # author, and last log entry. Maybe the columns will change; we're at
      # the mercy of viewvc, but this constant can be easily updated.
      elements = row.getElementsByTagName('td')
      if len(elements) != 5:
        continue
      name_element, version_element, _, __, ___ = elements
      name = inner_text(name_element)  # note: will end in / for directories
      try:
        version = int(inner_text(version_element))
      except ValueError:
        continue
      child_versions[name] = version
    if not child_versions:
      continue
    # Parent version is the max version of all children, since it's SVN.
    parent_version = max(child_versions.values())
    # All versions in StatInfo need to be strings.
    return StatInfo(str(parent_version),
                    dict((path, str(version))
                         for path, version in child_versions.iteritems()))
  # Bleh, but, this data is so unreliable. There are actually some empty file
  # listings caused by git/svn/something not cleaning up empty dirs.
  return StatInfo('0', {})
def testStat(self):
  self._gfs.Refresh().Get()
  dir_stat = StatInfo(FAKE_HASH, {
    'hello.notpy': StatInfo(FAKE_HASH),
    '__init__.notpy': StatInfo(FAKE_HASH)
  })

  self.assertEqual(StatInfo(FAKE_HASH), self._gfs.Stat('README.md'))
  self.assertEqual(StatInfo(FAKE_HASH), self._gfs.Stat('src/hello.notpy'))
  self.assertEqual(dir_stat, self._gfs.Stat('src/'))
def testRefresh(self):
  test_bundle = _TestBundle()
  gfs, fetcher = test_bundle.CreateGfsAndFetcher()

  # It shouldn't fetch until Refresh does so; then it will do 2 fetches, one
  # for the stat, and another for the read.
  self.assertTrue(*fetcher.CheckAndReset())
  gfs.Refresh().Get()
  self.assertTrue(*fetcher.CheckAndReset(fetch_count=1,
                                         fetch_async_count=1,
                                         fetch_resolve_count=1))

  # Refresh is just an alias for Read('').
  gfs.Refresh().Get()
  self.assertTrue(*fetcher.CheckAndReset())

  initial_dir_read = sorted(gfs.ReadSingle('').Get())
  initial_file_read = gfs.ReadSingle('dir/file1').Get()

  version, data = test_bundle.Mutate()

  # Check that the changes have not affected the file system yet.
  self.assertEqual(initial_dir_read, sorted(gfs.ReadSingle('').Get()))
  self.assertEqual(initial_file_read, gfs.ReadSingle('dir/file1').Get())
  self.assertNotEqual(StatInfo(version), gfs.Stat(''))

  gfs, fetcher = test_bundle.CreateGfsAndFetcher()
  gfs.Refresh().Get()
  self.assertTrue(*fetcher.CheckAndReset(fetch_count=1,
                                         fetch_async_count=1,
                                         fetch_resolve_count=1))

  # Check that the changes have affected the file system.
  self.assertEqual(data, gfs.ReadSingle('new-file').Get())
  self.assertEqual(test_bundle.files['zipfile/dir/file1'],
                   gfs.ReadSingle('dir/file1').Get())
  self.assertEqual(StatInfo(version), gfs.Stat('new-file'))

  # Regression test: ensure that reading the data after it's been mutated,
  # but before Refresh() has been realised, still returns the correct data.
  gfs, fetcher = test_bundle.CreateGfsAndFetcher()
  version, data = test_bundle.Mutate()

  refresh_future = gfs.Refresh()
  self.assertTrue(*fetcher.CheckAndReset(fetch_count=1, fetch_async_count=1))

  self.assertEqual(data, gfs.ReadSingle('new-file').Get())
  self.assertEqual(test_bundle.files['zipfile/dir/file1'],
                   gfs.ReadSingle('dir/file1').Get())
  self.assertEqual(StatInfo(version), gfs.Stat('new-file'))

  refresh_future.Get()
  self.assertTrue(*fetcher.CheckAndReset(fetch_resolve_count=1))
def testStat(self):
  # This is the hash value from the zip on disk.
  real_hash = 'c36fc23688a9ec9e264d3182905dc0151bfff7d7'

  self._gfs.Refresh().Get()
  dir_stat = StatInfo(real_hash, {
    'hello.notpy': StatInfo(real_hash),
    '__init__.notpy': StatInfo(real_hash)
  })

  self.assertEqual(StatInfo(real_hash), self._gfs.Stat('README.md'))
  self.assertEqual(StatInfo(real_hash), self._gfs.Stat('src/hello.notpy'))
  self.assertEqual(dir_stat, self._gfs.Stat('src/'))
def Stat(self, path):
  self._stat_count += 1
  children = dict((path.strip('/') + str(i), self.stat_value)
                  for i in range(5))
  if not path.endswith('/'):
    children[path.rsplit('/', 1)[-1]] = self.stat_value
  return StatInfo(self.stat_value, children)
def testDirStat(self):
  file_system = self._CreateSubversionFileSystem()
  stat_info = file_system.Stat('stat/')
  expected = StatInfo(
      '151113',
      child_versions=json.loads(self._ReadLocalFile('stat_result.json')))
  self.assertEqual(expected, stat_info)
def Stat(self, path):
  '''Stats the directory given, or if a file is given, stats the file's
  parent directory to get info about the file.
  '''
  # Always stat the parent directory, since it will have the stat of the child
  # anyway, and this gives us an entire directory's stat info at once.
  dir_path, file_path = posixpath.split(path)
  if dir_path and not dir_path.endswith('/'):
    dir_path += '/'

  # ... and we only ever need to cache the dir stat, too.
  dir_stat = self._stat_object_store.Get(dir_path).Get()
  if dir_stat is None:
    dir_stat = self._file_system.Stat(dir_path)
    assert dir_stat is not None  # should raise a FileNotFoundError
    self._stat_object_store.Set(dir_path, dir_stat)

  if path == dir_path:
    stat_info = dir_stat
  else:
    file_version = dir_stat.child_versions.get(file_path)
    if file_version is None:
      raise FileNotFoundError('No stat found for %s in %s (found %s)' %
                              (path, dir_path, dir_stat.child_versions))
    stat_info = StatInfo(file_version)

  return stat_info
def _CreateStatInfo(json_data):
  '''Returns a StatInfo object comprised of the tree ID for |json_data|,
  as well as the tree IDs for the entries in |json_data|.
  '''
  tree = _ParseGitilesJson(json_data)
  return StatInfo(tree['id'],
                  dict((e['name'], e['id']) for e in tree['entries']))
def Stat(self, path):
  directory, filename = posixpath.split(path)
  directory += '/'
  if self._revision is not None:
    # |stat_fetch| uses viewvc which uses pathrev= for version.
    directory += '?pathrev=%s' % self._revision

  try:
    result = self._stat_fetcher.Fetch(directory)
  except Exception as e:
    raise FileSystemError('Error fetching %s for Stat: %s' %
                          (path, traceback.format_exc()))

  if result.status_code == 404:
    raise FileNotFoundError('Got 404 when fetching %s for Stat, content %s' %
                            (path, result.content))
  if result.status_code != 200:
    raise FileNotFoundError('Got %s when fetching %s for Stat, content %s' %
                            (result.status_code, path, result.content))

  stat_info = _CreateStatInfo(result.content)
  if stat_info.version is None:
    raise FileSystemError('Failed to find version of dir %s' % directory)
  if path.endswith('/'):
    return stat_info
  if filename not in stat_info.child_versions:
    raise FileNotFoundError('%s from %s was not in child versions for Stat' %
                            (filename, path))
  return StatInfo(stat_info.child_versions[filename])
def resolve():
  try:
    result = result_future.Get()
  except Exception as e:
    exc_type = FileNotFoundError if IsDownloadError(e) else FileSystemError
    raise exc_type('%s fetching %s for Stat: %s' %
                   (type(e).__name__, path, traceback.format_exc()))

  if result.status_code == 404:
    raise FileNotFoundError('Got 404 when fetching %s for Stat, '
                            'content %s' % (path, result.content))
  if result.status_code != 200:
    raise FileNotFoundError('Got %s when fetching %s for Stat, content %s' %
                            (result.status_code, path, result.content))

  stat_info = _CreateStatInfo(result.content)
  if stat_info.version is None:
    raise FileSystemError('Failed to find version of dir %s' % directory)
  if path == '' or path.endswith('/'):
    return stat_info
  if filename not in stat_info.child_versions:
    raise FileNotFoundError('%s from %s was not in child versions for Stat' %
                            (filename, path))
  return StatInfo(stat_info.child_versions[filename])
def StatAsync(self, path):
  def get_child_versions(path):
    return dict((e['name'], e['id'])
                for e in local_git_util.ListDir(path, self._commit))

  def get_file_version(dir, filename):
    try:
      return next(e['id'] for e in local_git_util.ListDir(dir, self._commit)
                  if e['name'] == filename)
    except StopIteration:
      raise FileNotFoundError('%s not found in revision %s' %
                              (path, self._commit))

  dir, filename = posixpath.split(path)
  if path == '':
    version = local_git_util.GetRootTree(self._commit)
    child_versions = get_child_versions('')
  elif IsDirectory(path):
    parent_dir, stat_dir = posixpath.split(dir)
    version = get_file_version(parent_dir, stat_dir)
    child_versions = get_child_versions(dir)
  else:
    version = get_file_version(dir, filename)
    child_versions = None

  #print 'Accessing local git for stat on %s (%s)' % (path, version)

  return Future(value=StatInfo(version, child_versions))
def testFileStat(self):
  file_system, fetcher = _CreateSubversionFileSystem(
      _SHARED_FILE_SYSTEM_TEST_DATA)
  stat_info = file_system.Stat('stat/extension_api.h')
  self.assertTrue(*fetcher.CheckAndReset(async_count=1,
                                         async_resolve_count=1))
  self.assertEqual(StatInfo('146163'), stat_info)
def Stat(self, path):
  directory, filename = posixpath.split(path)
  directory += '/'
  if self._revision is not None:
    # |stat_fetch| uses viewvc which uses pathrev= for version.
    directory += '?pathrev=%s' % self._revision

  try:
    result = self._stat_fetcher.Fetch(directory)
  except Exception as e:
    # Convert all errors (typically some sort of DeadlineExceededError but
    # explicitly catching that seems not to work) to a FileNotFoundError to
    # reduce the exception-catching surface area of this class.
    raise FileNotFoundError('%s fetching %s for Stat: %s' %
                            (e.__class__.__name__, path, e))

  if result.status_code != 200:
    raise FileNotFoundError('Got %s when fetching %s for Stat' %
                            (result.status_code, path))

  stat_info = _CreateStatInfo(result.content)
  if stat_info.version is None:
    raise ValueError('Failed to find version of dir %s' % directory)
  if path.endswith('/'):
    return stat_info
  if filename not in stat_info.child_versions:
    raise FileNotFoundError('%s from %s was not in child versions for Stat' %
                            (filename, path))
  return StatInfo(stat_info.child_versions[filename])
def testCreateStatInfo(self):
  test_json = '\n'.join([
    ')]}\'',
    json.dumps({'id': 'some_long_string',
                'entries': [
                  {'mode': 33188,
                   'type': 'blob',
                   'id': 'long_id',
                   'name': '.gitignore'},
                  {'mode': 33188,
                   'type': 'blob',
                   'id': 'another_long_id',
                   'name': 'PRESUBMIT.py'},
                  {'mode': 33188,
                   'type': 'blob',
                   'id': 'yali',
                   'name': 'README'},
                ]})
  ])

  expected_stat_info = StatInfo('some_long_string', {
    '.gitignore': 'long_id',
    'PRESUBMIT.py': 'another_long_id',
    'README': 'yali'
  })

  self.assertEqual(_CreateStatInfo(test_json), expected_stat_info)
def _CreateStatInfo(html):
  parent_version = None
  child_versions = {}

  # Try all of the tables until we find the ones that contain the data (the
  # directory and file versions are in different tables).
  for table in _ParseHTML(html).getElementsByTagName('table'):
    # Within the table there is a list of files. However, there may be some
    # things beforehand; a header, "parent directory" list, etc. We will deal
    # with that below by being generous and just ignoring such rows.
    rows = table.getElementsByTagName('tr')
    for row in rows:
      cells = row.getElementsByTagName('td')

      # The version of the directory will eventually appear in the soup of
      # table rows, like this:
      #
      # <tr>
      #   <td>Directory revision:</td>
      #   <td><a href=... title="Revision 214692">214692</a> (of...)</td>
      # </tr>
      #
      # So look out for that.
      if len(cells) == 2 and _InnerText(cells[0]) == 'Directory revision:':
        links = cells[1].getElementsByTagName('a')
        if len(links) != 2:
          raise FileSystemError('ViewVC assumption invalid: directory '
                                'revision content did not have 2 <a> '
                                'elements, instead %s' %
                                _InnerText(cells[1]))
        this_parent_version = _InnerText(links[0])
        int(this_parent_version)  # sanity check
        if parent_version is not None:
          raise FileSystemError('There was already a parent version %s, and '
                                'we just found a second at %s' %
                                (parent_version, this_parent_version))
        parent_version = this_parent_version

      # The version of each file is a list of rows with 5 cells: name,
      # version, age, author, and last log entry. Maybe the columns will
      # change; we're at the mercy of viewvc, but this constant can be easily
      # updated.
      if len(cells) != 5:
        continue
      name_element, version_element, _, __, ___ = cells

      name = _InnerText(name_element)  # note: will end in / for directories
      try:
        version = int(_InnerText(version_element))
      except StandardError:
        continue
      child_versions[name] = str(version)

    if parent_version and child_versions:
      break

  return StatInfo(parent_version, child_versions)
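# A sketch of the ViewVC markup that _CreateStatInfo() above is written
# against, with hypothetical revision numbers; assuming _ParseHTML() wraps a
# standard DOM parser, the two-cell row supplies the parent version and each
# five-cell row supplies one child version:
#
#   <table>
#     <tr>
#       <td>Directory revision:</td>
#       <td><a title="Revision 214692">214692</a> (<a>of ...</a>)</td>
#     </tr>
#     <tr>
#       <td>foo.txt</td><td>214690</td><td>2 days</td>
#       <td>author</td><td>last log entry</td>
#     </tr>
#   </table>
#
# which would produce StatInfo('214692', {'foo.txt': '214690'}).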
def testDirStat(self):
  file_system = _CreateSubversionFileSystem(_SHARED_FILE_SYSTEM_TEST_DATA)
  stat_info = file_system.Stat('stat/')
  expected = StatInfo(
      '151113',
      child_versions=json.loads(test_util.ReadFile(
          '%s/stat_result.json' % _SHARED_FILE_SYSTEM_TEST_DATA)))
  self.assertEqual(expected, stat_info)
def _DefaultStat(self, path):
  version = 0
  # TODO(kalman): we should replace all of this by wrapping the
  # GithubFileSystem in a CachingFileSystem. A lot of work has been put into
  # CFS to be robust, and GFS is missing out.
  #
  # For example: the following line is wrong, but it could be moot.
  self._stat_object_store.Set(path, version)
  return StatInfo(version)
def _DefaultStat(self, path):
  version = 0
  # Cache for a minute so we don't try to keep fetching bad data.
  self._object_store.Set(path, version, object_store.GITHUB_STAT, time=60)
  return StatInfo(version)
def testCachedStat(self):
  test_fs = TestFileSystem({
    'bob': {
      'bob0': 'bob/bob0 contents',
      'bob1': 'bob/bob1 contents'
    }
  })
  mock_fs = MockFileSystem(test_fs)

  file_system = self._CreateCachingFileSystem(mock_fs, start_empty=False)

  self.assertEqual(StatInfo('0'), file_system.Stat('bob/bob0'))
  self.assertTrue(*mock_fs.CheckAndReset(stat_count=1))
  self.assertEqual(StatInfo('0'), file_system.Stat('bob/bob0'))
  self.assertTrue(*mock_fs.CheckAndReset())

  # Caching happens on a directory basis, so reading other files from that
  # directory won't result in a stat.
  self.assertEqual(StatInfo('0'), file_system.Stat('bob/bob1'))
  self.assertEqual(
      StatInfo('0', child_versions={'bob0': '0', 'bob1': '0'}),
      file_system.Stat('bob/'))
  self.assertTrue(*mock_fs.CheckAndReset())

  # Even though the stat is bumped, the object store still has it cached so
  # this won't update.
  test_fs.IncrementStat()
  self.assertEqual(StatInfo('0'), file_system.Stat('bob/bob0'))
  self.assertEqual(StatInfo('0'), file_system.Stat('bob/bob1'))
  self.assertEqual(
      StatInfo('0', child_versions={'bob0': '0', 'bob1': '0'}),
      file_system.Stat('bob/'))
  self.assertTrue(*mock_fs.CheckAndReset())
def Stat(self, path):
  version = self._object_store.Get(path, object_store.GITHUB_STAT).Get()
  if version is not None:
    return StatInfo(version)
  version = (json.loads(self._fetcher.Fetch('commits/HEAD').content)
             .get('commit', {})
             .get('tree', {})
             .get('sha', None))
  # Check if the JSON was valid, and set to 0 if not.
  if version is not None:
    self._object_store.Set(path, version, object_store.GITHUB_STAT)
  else:
    logging.warning('Problem fetching commit hash from github.')
    version = 0
    # Cache for a minute so we don't try to keep fetching bad data.
    self._object_store.Set(path, version, object_store.GITHUB_STAT, time=60)
  return StatInfo(version)