class Trie(ABCTrie):
    """Trie backed by the C ``datrie`` package.

    Python 2 build — note the 3to2-style ``func_annotations`` stubs kept
    after each method for compatibility with translated callers.
    """

    def __init__(self, data):
        # datrie needs its full alphabet up front, so gather every
        # character used by the keys before building the trie.
        alphabet = set()
        for key in data.keys():
            if not isinstance(key, unicode):
                raise TypeError(u"All keys must be strings")
            alphabet.update(key)
        self._data = DATrie(u"".join(alphabet))
        for key, value in data.items():
            self._data[key] = value
    __init__.func_annotations = {}

    def __contains__(self, key):
        return key in self._data
    __contains__.func_annotations = {}

    def __len__(self):
        return len(self._data)
    __len__.func_annotations = {}

    def __iter__(self):
        # Key iteration is not supported by this wrapper.
        raise NotImplementedError()
    __iter__.func_annotations = {}

    def __getitem__(self, key):
        return self._data[key]
    __getitem__.func_annotations = {}

    def keys(self, prefix=None):
        return self._data.keys(prefix)
    keys.func_annotations = {}

    def has_keys_with_prefix(self, prefix):
        return self._data.has_keys_with_prefix(prefix)
    has_keys_with_prefix.func_annotations = {}

    def longest_prefix(self, prefix):
        return self._data.longest_prefix(prefix)
    longest_prefix.func_annotations = {}

    def longest_prefix_item(self, prefix):
        return self._data.longest_prefix_item(prefix)
    longest_prefix_item.func_annotations = {}
def start_tracking_deletions(self):
    """Begin recording which subtrees have been deleted.

    While tracking is active, update_hwm() can skip updates to keys
    that have subsequently been deleted.  Pair every call with a later
    stop_tracking_deletions() so the tracking structures are released.
    """
    _log.info("Started tracking deletions")
    self._latest_deletion = None
    self._deletion_hwms = Trie(TRIE_CHARS)
def __init__(self):
    # Highest etcd index seen per key, stored in a trie so that
    # prefix/subtree operations are cheap.  The trie implementation
    # requires a fixed alphabet, so keys are restricted/encoded to
    # TRIE_CHARS before insertion.
    self._hwms = Trie(TRIE_CHARS)
    # Deletion-tracking trie; non-None only between
    # start_tracking_deletions() and stop_tracking_deletions().
    self._deletion_hwms = None
    # Highest etcd index of any deletion seen so far.  Lets callers
    # short-circuit the (expensive) deletion-trie lookup for events
    # that arrive after every recorded deletion.
    self._latest_deletion = None
def trie_unpickler(bytes):
    """Rebuild a Trie from its serialized byte string.

    Round-trips through a temporary file on disk — presumably Trie.read()
    requires a real file object rather than accepting bytes directly
    (TODO confirm against the Trie API).

    :param bytes: serialized trie data (name shadows the builtin; kept
        for backward compatibility with keyword callers).
    :return: the deserialized Trie.
    """
    handle, path = tempfile.mkstemp()
    try:
        # Fix: the original never closed the OS-level fd returned by
        # mkstemp() (it re-opened the path with the Python-2-only
        # file() builtin, leaking the fd).  Wrap the fd directly so it
        # is closed deterministically.
        with os.fdopen(handle, 'wb') as tmp:
            tmp.write(bytes)
        with open(path, 'rb') as tmp:
            return Trie.read(tmp)
    finally:
        # Always remove the temp file, even if reading fails.
        os.unlink(path)
def __init__(self, data):
    """Build the trie from *data*, a mapping of text keys to values.

    Raises TypeError if any key is not a text string.
    """
    # Collect the alphabet actually used by the keys; the backing
    # datrie needs the full character set at construction time.
    alphabet = set()
    for key in data.keys():
        if not isinstance(key, text_type):
            raise TypeError("All keys must be strings")
        alphabet.update(key)
    self._data = DATrie("".join(alphabet))
    for key, value in data.items():
        self._data[key] = value
class Trie(ABCTrie):
    """Trie implementation that delegates to the C ``datrie`` package."""

    def __init__(self, data):
        """Build the trie from *data*, a mapping of text keys to values.

        Raises TypeError if any key is not a text string.
        """
        # datrie requires its full alphabet up front, so gather every
        # character appearing in any key first.
        alphabet = set()
        for key in data.keys():
            if not isinstance(key, text_type):
                raise TypeError("All keys must be strings")
            alphabet.update(key)
        self._data = DATrie("".join(alphabet))
        for key, value in data.items():
            self._data[key] = value

    def __contains__(self, key):
        return key in self._data

    def __len__(self):
        return len(self._data)

    def __iter__(self):
        # Key iteration is not supported by this wrapper.
        raise NotImplementedError()

    def __getitem__(self, key):
        return self._data[key]

    def keys(self, prefix=None):
        # Delegates directly; datrie accepts a prefix filter.
        return self._data.keys(prefix)

    def has_keys_with_prefix(self, prefix):
        return self._data.has_keys_with_prefix(prefix)

    def longest_prefix(self, prefix):
        return self._data.longest_prefix(prefix)

    def longest_prefix_item(self, prefix):
        return self._data.longest_prefix_item(prefix)
class HighWaterTracker(object):
    """
    Tracks the highest etcd index for which we've seen a particular
    etcd key.

    This class is expected to be used as follows:

    Starting with a resync, while also merging events from our watch
    on etcd:

    * Call start_tracking_deletions() to enable resolution between
      events and the snapshot.
    * Repeatedly call update_hwm() and store_deletion(), feeding in the
      data from the snapshot and event stream.
    * At the end of the snapshot processing, call
      stop_tracking_deletions() to discard the tracking metadata
      (which would otherwise grow indefinitely).
    * Call remove_old_keys() to find and delete any keys that have not
      been seen since before the snapshot was started, and hence must
      have been deleted before the snapshot was taken.

    While in sync:

    * feed in events with update_hwm() and store_deletion().

    At any point, if a new resync is required restart from
    "Call start_tracking_deletions()..."
    """
    def __init__(self):
        # We use a trie to track the highest etcd index at which we've seen
        # each key.  The trie implementation forces a fixed character set;
        # we explicitly allow the characters we expect and encode any others
        # that we're not expecting (see encode_key()/decode_key() used
        # below).
        self._hwms = Trie(TRIE_CHARS)
        # Set to a Trie while we're tracking deletions.  None otherwise.
        self._deletion_hwms = None
        # Optimization: tracks the highest etcd index at which we've seen a
        # deletion.  This allows us to skip an expensive lookup in the
        # _deletion_hwms trie for events that come after the deletion.
        # NOTE(review): starts as None; the `<` comparisons below rely on
        # Python 2 ordering None before all numbers — confirm this module
        # targets Python 2 only.
        self._latest_deletion = None

    def start_tracking_deletions(self):
        """
        Starts tracking which subtrees have been deleted so that update_hwm
        can skip updates to keys that have subsequently been deleted.

        Should be paired with a call to stop_tracking_deletions() to release
        the associated tracking data structures.
        """
        _log.info("Started tracking deletions")
        self._deletion_hwms = Trie(TRIE_CHARS)
        self._latest_deletion = None

    def stop_tracking_deletions(self):
        """
        Stops deletion tracking and frees up the associated resources.

        Calling this asserts that subsequent calls to update_hwm() will only
        use HWMs after any stored deletes.
        """
        _log.info("Stopped tracking deletions")
        self._deletion_hwms = None
        self._latest_deletion = None

    def update_hwm(self, key, new_mod_idx):
        """
        Updates the HWM for a key if the new value is greater than the old.

        If deletion tracking is enabled, resolves deletions so that updates
        to subtrees that have been deleted are skipped iff the deletion is
        after the update in HWM order.

        :return int|NoneType: the old HWM of the key (or the HWM at which it
            was deleted) or None if it did not previously exist.
        """
        _log.debug("Updating HWM for %s to %s", key, new_mod_idx)
        key = encode_key(key)
        if (self._deletion_hwms is not None and
                # Optimization: avoid expensive lookup if this update comes
                # after all deletions (if _latest_deletion is still None,
                # the `<` test is False under Python 2 and we fall through).
                new_mod_idx < self._latest_deletion):
            # We're tracking deletions, check that this key hasn't been
            # deleted.  A deletion of any ancestor directory counts, hence
            # the longest-prefix lookup rather than an exact match.
            del_hwm = self._deletion_hwms.longest_prefix_value(key, None)
            if new_mod_idx < del_hwm:
                _log.debug("Key %s previously deleted, skipping", key)
                return del_hwm
        try:
            old_hwm = self._hwms[key]  # Trie doesn't have get().
        except KeyError:
            old_hwm = None
        if old_hwm < new_mod_idx:  # Works for None too (Python 2 ordering).
            _log.debug("Key %s HWM updated to %s, previous %s",
                       key, new_mod_idx, old_hwm)
            self._hwms[key] = new_mod_idx
        return old_hwm

    def store_deletion(self, key, deletion_mod_idx):
        """
        Store that a given key (or directory) was deleted at a given HWM.

        :return: List of known keys that were deleted.  This will be the
            leaves only when a subtree is being deleted.
        """
        _log.debug("Key %s deleted", key)
        key = encode_key(key)
        # max() tolerates a None initial value under Python 2.
        self._latest_deletion = max(deletion_mod_idx, self._latest_deletion)
        if self._deletion_hwms is not None:
            _log.debug("Tracking deletion in deletions trie")
            self._deletion_hwms[key] = deletion_mod_idx
        # Remove every known key under the deleted prefix from the HWM
        # trie, collecting them (decoded) for the caller.
        deleted_keys = []
        for child_key, child_mod in self._hwms.items(key):
            del self._hwms[child_key]
            deleted_keys.append(decode_key(child_key))
        _log.debug("Found %s keys deleted under %s", len(deleted_keys), key)
        return deleted_keys

    def remove_old_keys(self, hwm_limit):
        """
        Deletes and returns all keys that have HWMs less than hwm_limit.

        Must not be called while deletion tracking is active.

        :return: list of keys that were deleted.
        """
        assert not self._deletion_hwms, \
            "Delete tracking incompatible with remove_old_keys()"
        _log.info("Removing keys that are older than %s", hwm_limit)
        # Walk the whole trie via datrie's low-level iterator; collect
        # first, then delete, so we never mutate while iterating.
        old_keys = []
        state = datrie.State(self._hwms)
        state.walk(u"")
        it = datrie.Iterator(state)
        while it.next():
            value = it.data()
            if value < hwm_limit:
                old_keys.append(it.key())
        for old_key in old_keys:
            del self._hwms[old_key]
        _log.info("Deleted %s old keys", len(old_keys))
        # NOTE(review): map() returns a list on Python 2 but a lazy
        # iterator on Python 3 — confirm target version matches callers'
        # expectation of a list.
        return map(decode_key, old_keys)

    def __len__(self):
        # Number of keys currently tracked in the HWM trie.
        return len(self._hwms)
def __init__(self, trie=None, untrieable=None):
    """Initialise the wrapper around a trie plus an overflow dict.

    :param trie: backing trie; defaults to a fresh Trie over
        string.printable.
    :param untrieable: dict for entries that cannot go in the trie
        (presumably keys outside the trie's alphabet — TODO confirm);
        defaults to a fresh empty dict.

    Fix: the original signature used ``trie=Trie(string.printable)`` and
    ``untrieable={}`` — both defaults were evaluated once at function
    definition time, so every instance created with defaults shared ONE
    mutable trie and ONE mutable dict.  None sentinels restore
    per-instance defaults while keeping the call signature compatible.
    """
    self.trie = Trie(string.printable) if trie is None else trie
    self.untrieable = {} if untrieable is None else untrieable
    self.max_index = 2 ** 28
def load(cls, prefix):
    """Restore an instance previously saved under *prefix*.

    Reads the trie from ``prefix + ".trie"`` and the pickled overflow
    mapping from ``prefix + ".utrie"``.

    NOTE(review): pickle.load() executes arbitrary code from the file —
    only load files this process wrote itself.
    """
    main_trie = Trie.load(prefix + ".trie")
    with open(prefix + ".utrie", 'rb') as fh:
        extras = pickle.load(fh)
    return cls(main_trie, extras)
# need fast trie (prefix tree) data structure. Choose from 3 libraries with (almost) compatible APIs. if False: from datrie import Trie # pip install datrie trie_path = 'datrie.dump' elif False: from marisa_trie import Trie # pip install marisa-trie trie_path = 'marisa_trie.dump' else: from dawg import CompletionDAWG as Trie # pip install dawg trie_path = 'dawg.dump' if os.path.exists(trie_path): # here's one i built earlier if Trie.__module__ == 'dawg': trie = Trie() trie.load(trie_path) else: trie = Trie.load(trie_path) else: dict_path = "garbled_email_dictionary.txt" if not os.path.exists(dict_path): # download code jam's dictionary dict_url = "https://code.google.com/codejam/contest/static/garbled_email_dictionary.txt" from urllib.request import urlretrieve urlretrieve(dict_url, dict_path) with open(dict_path) as f: words = [line.strip() for line in f.readlines()] if Trie.__module__ == 'datrie':