def test_bisect_key():
    temp = SortedDict(modulo, ((val, val) for val in range(100)))
    temp._reset(7)
    assert all(temp.bisect(val) == ((val % 10) + 1) * 10 for val in range(100))
    assert all(temp.bisect_right(val) == ((val % 10) + 1) * 10 for val in range(100))
    assert all(temp.bisect_left(val) == (val % 10) * 10 for val in range(100))
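# The test above relies on a `modulo` key function that is not shown in this snippet.
# A minimal sketch of what it presumably looks like: keys 0..99 collapse onto the ten
# buckets 0..9, so bisect_right(val) lands after the ((val % 10) + 1) * 10-th entry.
def modulo(val):
    return val % 10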
class LCM(object):
    def __init__(self, min_supp=.005, n_jobs=1, verbose=1, return_tids=False):
        self.min_support = min_supp
        self.item_to_tids = SortedDict()
        self.verbose = verbose
        self.format = self._format_with_tids if return_tids else self._format
        self.extra_col = 'tids' if return_tids else 'support'
        self.n_transactions = 0

    def add(self, transaction):
        transaction = frozenset(transaction)
        for item in transaction:
            if item in self.item_to_tids:
                self.item_to_tids[item].add(self.n_transactions)
            else:
                self.item_to_tids[item] = RoaringBitmap([self.n_transactions])
        self.n_transactions += 1

    def discover_yield(self):
        items = [e for e in self.item_to_tids.items() if len(e[1]) >= self.min_support]
        for key, key_idxs in items:
            if len(key_idxs) >= self.min_support:
                yield from self._inner(frozenset(), key_idxs, key, items)

    def discover(self):
        data = self.discover_yield()
        return pd.DataFrame.from_records(data=data, columns=['itemset', self.extra_col])

    def _format_with_tids(self, p_prime, p_idxs):
        return p_prime, p_idxs

    def _format(self, p_prime, p_idxs):
        return p_prime, len(p_idxs)

    def get_new_scope_keys(self, new_items, p_prime, p_idxs, limit):
        for new_limit, limit_idxs in new_items:
            if new_limit not in p_prime:
                inter_len = p_idxs.intersection_len(limit_idxs)
                if inter_len >= self.min_support:
                    new_pidxs = p_idxs.intersection(limit_idxs)
                    # yield the candidate item itself, not the current limit,
                    # so the recursion in _inner advances to a new limit
                    yield new_limit, new_pidxs

    def _inner(self, p, p_idxs, limit, scope_items):
        cp = (k for k, idxs in reversed(scope_items) if p_idxs.issubset(idxs))
        max_k = next(cp, None)
        if max_k and max_k == limit:
            cp = set(cp).union({max_k})
            p_prime = p.union(cp)
            yield self.format(p_prime, p_idxs)
            new_items = self.item_to_tids.items()[:self.item_to_tids.bisect(limit)]
            for new_limit, new_pidxs in self.get_new_scope_keys(new_items, p_prime, p_idxs, limit):
                yield from self._inner(p_prime, new_pidxs, new_limit, scope_items)
            cp.clear()
class BisectNodePolicy(BaseNodePolicy):
    def __init__(self, hash_class=defaultHashClass):
        self.ring = SortedDict()
        super(BisectNodePolicy, self).__init__(hash_class=hash_class)

    def add_node(self, node=None, vnode_count=None):
        for i in range_(int(vnode_count)):
            self.ring[self._gen_key(node, i)] = node

    def remove_node(self, node=None):
        keys = list(self.ring.keys())
        for key in keys:
            if self.ring[key] == node:
                self.ring.pop(key)

    def get_proper_node(self, key):
        key, _ = self.ring.peekitem(self._find_proper_pos(key))
        return self.ring[key]

    def _find_proper_pos(self, key):
        key = self._gen_key(key)
        pos = self.ring.bisect(key)
        # if object_hash == node_hash, return node index
        if key in self.ring:
            return pos - 1
        # embodies the concept of the ring: wrap around past the last node
        if pos == len(self.ring):
            return 0
        return pos
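# A minimal, self-contained sketch of the same ring-lookup idea used by BisectNodePolicy
# above: hash keys onto a SortedDict and wrap around to the first node when bisect()
# runs off the end. The md5-based hash below is an illustrative assumption, not the
# policy's actual hash_class.
import hashlib
from sortedcontainers import SortedDict

ring = SortedDict()
for node in ("node-a", "node-b", "node-c"):
    for i in range(4):  # four virtual nodes per physical node
        h = int(hashlib.md5("{}#{}".format(node, i).encode()).hexdigest(), 16)
        ring[h] = node

def lookup(key):
    h = int(hashlib.md5(key.encode()).hexdigest(), 16)
    pos = ring.bisect(h)
    if pos == len(ring):  # walked off the end: wrap to the start of the ring
        pos = 0
    return ring.peekitem(pos)[1]

print(lookup("some-object-key"))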
class RangeModule:
    def __init__(self):
        self.data = SortedDict()

    def addRange(self, left: int, right: int) -> None:
        l, r = self.data.bisect(left), self.data.bisect(right)
        if l != 0:
            # move l left by one so it points at the candidate lower bound
            l -= 1
            # if left is beyond the end of the previous interval, move l back up
            if self.data.peekitem(l)[1] < left:
                l += 1
        if l != r:
            # given the adjusted left and right intervals, a merge may be needed: take the
            # min of the left endpoints and the max of the right endpoints to maximize the
            # interval size
            left, right = min(left, self.data.peekitem(l)[0]), max(right, self.data.peekitem(r - 1)[1])
            # now that we have the new interval, pop the redundant intervals
            for _ in range(l, r):
                self.data.popitem(l)
        # insert the new interval
        self.data[left] = right

    def queryRange(self, left: int, right: int) -> bool:
        l, r = self.data.bisect_right(left), self.data.bisect_right(right)
        if l == 0 or self.data.peekitem(l - 1)[1] < right:
            return False
        return True

    def removeRange(self, left: int, right: int) -> None:
        l, r = self.data.bisect_right(left), self.data.bisect_right(right)
        if l != 0:
            l -= 1
            if self.data.peekitem(l)[1] < left:
                l += 1
        if l != r:
            ll, rr = min(left, self.data.peekitem(l)[0]), max(right, self.data.peekitem(r - 1)[1])
            for _ in range(l, r):
                self.data.popitem(l)
            if ll < left:
                self.data[ll] = left
            if right < rr:
                self.data[right] = rr
class RangeModuleDict:
    def __init__(self):
        self.data = SortedDict()

    def addRange(self, left: int, right: int) -> None:
        l = self.data.bisect(left)
        r = self.data.bisect(right)
        if l != 0:
            l -= 1
            if self.data.peekitem(l)[1] < left:
                l += 1
        if l != r:
            left = min(left, self.data.peekitem(l)[0])
            right = max(right, self.data.peekitem(r - 1)[1])
            for _ in range(l, r):
                self.data.popitem(l)
        self.data[left] = right

    def queryRange(self, left: int, right: int) -> bool:
        l = self.data.bisect_right(left)
        r = self.data.bisect_right(right)
        if l == 0 or self.data.peekitem(l - 1)[1] < right:
            return False
        return True

    def removeRange(self, left: int, right: int) -> None:
        l = self.data.bisect_right(left)
        r = self.data.bisect_right(right)
        if l != 0:
            l -= 1
            if self.data.peekitem(l)[1] < left:
                l += 1
        if l != r:
            minLeft = min(left, self.data.peekitem(l)[0])
            maxRight = max(right, self.data.peekitem(r - 1)[1])
            for _ in range(l, r):
                self.data.popitem(l)
            if minLeft < left:
                self.data[minLeft] = left
            if right < maxRight:
                self.data[right] = maxRight
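# A quick, hedged usage sketch for the two RangeModule variants above, following the
# add/query/remove contract implied by the methods:
rm = RangeModuleDict()
rm.addRange(10, 20)
rm.removeRange(14, 16)
print(rm.queryRange(10, 14))   # True: [10, 14) is fully covered
print(rm.queryRange(13, 15))   # False: [14, 16) was removed
print(rm.queryRange(16, 17))   # True: [16, 20) is still covered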
def test_bisect():
    mapping = [(val, pos) for pos, val in enumerate(string.ascii_lowercase)]
    temp = SortedDict(mapping)
    assert temp.bisect_left('a') == 0
    assert temp.bisect_right('f') == 6
    assert temp.bisect('f') == 6
class SLIM(BaseMiner, MDLOptimizer):
    """SLIM: Directly Mining Descriptive Patterns

    SLIM looks for a compressed representation of transactional data.
    This compressed representation is a set of descriptive patterns, and can be used to:

    - provide a natively interpretable modeling of this data
    - make predictions on new data, using this condensed representation as an encoding scheme

    The idea of early stopping is inspired from
    http://eda.mmci.uni-saarland.de/pres/ida14-slimmer-poster.pdf

    Parameters
    ----------
    n_iter_no_change: int, default=100
        Number of candidate evaluations with no improvement to count before stopping optimization.
    tol: float, default=None
        Tolerance for the early stopping, in bits.
        When the compression size is not improving by at least `tol` for `n_iter_no_change`
        iterations, the training stops.
        Defaults to None; it will be automatically computed considering the size of the input data.
    pruning: bool, default=True
        Whether to activate pruning or not. Pruned itemsets may be useful at prediction time,
        so it is usually recommended to set it to False to build a classifier.
        The model will be less concise, but will lead to more accurate predictions on average.

    Examples
    --------
    >>> from skmine.itemsets import SLIM
    >>> D = [['bananas', 'milk'], ['milk', 'bananas', 'cookies'], ['cookies', 'butter', 'tea']]
    >>> SLIM().fit(D).codetable  # doctest: +SKIP
    (butter, tea)        [2]
    (milk, bananas)   [0, 1]
    (cookies)         [1, 2]
    dtype: object

    References
    ----------
    .. [1]
        Smets, K & Vreeken, J
        "Slim: Directly Mining Descriptive Patterns", 2012

    .. [2] Gandhi, M & Vreeken, J
        "Slimmer, outsmarting Slim", 2014
    """

    def __init__(self, *, n_iter_no_change=100, tol=None, pruning=True):
        self.n_iter_no_change = n_iter_no_change
        self.tol = tol
        self.standard_codetable_ = None
        self.codetable_ = SortedDict()
        self.model_size_ = None  # L(CT|D)
        self.data_size_ = None  # L(D|CT)
        self.pruning = pruning

    def fit(self, D, y=None):  # pylint:disable = too-many-locals
        """fit SLIM on a transactional dataset

        This generates new candidate patterns and adds those which improve compression,
        iteratively refining ``self.codetable_``

        Parameters
        ----------
        D: pd.DataFrame
            Transactional dataset, either as an iterable of iterables
            or encoded as tabular binary data
        """
        self._prefit(D, y=y)
        n_iter_no_change = 0
        seen_cands = set()
        tol = self.tol or self.standard_codetable_.map(len).median()

        while n_iter_no_change < self.n_iter_no_change:
            candidates = self.generate_candidates(stack=seen_cands)
            for cand, _ in candidates:
                data_size, model_size, update_d, prune_set = self.evaluate(cand)
                diff = (self.model_size_ + self.data_size_) - (data_size + model_size)

                if diff > 0.01:  # underflow
                    self.codetable_.update(update_d)
                    if self.pruning:
                        self.codetable_, data_size, model_size = self._prune(
                            self.codetable_, prune_set, model_size, data_size)
                    self.data_size_ = data_size
                    self.model_size_ = model_size

                if diff < tol:
                    n_iter_no_change += 1
                    if n_iter_no_change > self.n_iter_no_change:
                        break  # inner break

            if not candidates:  # if empty candidate generation
                n_iter_no_change += self.n_iter_no_change  # force while loop to break

        return self

    def decision_function(self, D):
        """Compute covers on new data, and return code length

        This function is named ``decision_function`` because code lengths
        represent the distance between a point and the current codetable.

        Setting ``pruning`` to False when creating the model is recommended
        to cover unseen data, and especially when building a classifier.

        Parameters
        ----------
        D: pd.DataFrame or np.ndarray
            new data to make predictions on, in tabular format

        Example
        -------
        >>> from skmine.itemsets import SLIM; import pandas as pd
        >>> def to_tabular(D): return pd.Series(D).str.join('|').str.get_dummies(sep="|")
        >>> D = [['bananas', 'milk'], ['milk', 'bananas', 'cookies'], ['cookies', 'butter', 'tea']]
        >>> new_D = to_tabular([['cookies', 'butter']])
        >>> slim = SLIM().fit(to_tabular(D))
        >>> slim.decision_function(new_D)
        0   -1.321928
        dtype: float32
        """
        D = _check_D(D)
        codetable = pd.Series(self.codetable_)
        D_sct = {
            k: Bitmap(np.where(D[k])[0])
            for k in D.columns if k in self.standard_codetable_
        }
        covers = cover(D_sct, codetable.index)

        mat = np.zeros(shape=(len(D), len(covers)))
        for idx, tids in enumerate(covers.values()):
            mat[tids, idx] = 1
        mat = pd.DataFrame(mat, columns=covers.keys())

        code_lengths = codetable.map(len)
        ct_codes = code_lengths / code_lengths.sum()
        codes = (mat * ct_codes).sum(axis=1).astype(np.float32)
        # positive sign on log2 to return negative distance: sklearn
        r = _log2(codes)
        r[r == 0] = -np.inf  # zeros would fool a `shortest code wins` strategy
        return r

    def generate_candidates(self, stack=None, thresh=1e3):
        """
        Generate candidates from the current codetable (SLIM is any-time)

        Note that `stack` is updated during the execution of this method.

        Parameters
        ----------
        stack: set[frozenset], default=None
            a stack of already-seen candidates to be excluded
        thresh: int, default=1_000
            if the size of the current codetable is higher than `thresh`,
            candidates are generated on-the-fly, and remain unsorted.
            If not, they are returned in a list, sorted by decreasing order of estimated gain

        Returns
        -------
        iterator[tuple(frozenset, Bitmap)]
        """
        ct = SortedDict(self._standard_candidate_order, self.codetable.items())
        # if the codetable holds a big number of elements, just take a generator, do not sort output
        gen = generate_candidates if len(ct) < thresh else generate_candidates_big
        return gen(ct, stack=stack)

    def evaluate(self, candidate):
        """
        Evaluate ``candidate``, considering the current codetable and a dataset ``D``

        Parameters
        ----------
        candidate: frozenset
            a new candidate to be evaluated

        Returns
        -------
        (float, float, dict, set)
            updated (data size, model size, codetable),
            and finally the set of itemsets for which usage decreased
        """
        idx = self.codetable_.bisect(candidate)
        ct = list(self.codetable_)
        ct.insert(idx, candidate)

        D = {k: v.copy() for k, v in self.standard_codetable_.items()}
        CTc = cover(D, ct)

        updated, decreased = {candidate: CTc[candidate]}, set()
        for iset, usage in self.codetable_.items():  # TODO useless if size is too big
            if usage != CTc[iset]:
                updated[iset] = CTc[iset]
                if len(CTc[iset]) < len(usage):
                    decreased.add(iset)

        data_size, model_size = self._compute_sizes(CTc)  # TODO pruning in evaluate

        return data_size, model_size, updated, decreased

    def reconstruct(self):
        """reconstruct the original data from the current `self.codetable_`"""
        return reconstruct(self.codetable_)

    @lru_cache(maxsize=1024)
    def get_support(self, itemset):
        """Get support from an itemset"""
        U = reduce(Bitmap.intersection, self.standard_codetable_.loc[itemset])
        return len(U)

    def _standard_cover_order(self, itemset):
        """
        Returns a tuple associated with an itemset,
        so that many itemsets can be sorted in Standard Cover Order
        """
        return (-len(itemset), -self.get_support(itemset), tuple(itemset))

    def _standard_candidate_order(self, itemset):
        return (-self.get_support(itemset), -len(itemset), tuple(itemset))

    def _prefit(self, D, y=None):
        if hasattr(D, 'ndim') and D.ndim == 2:
            D = _check_D(D)
            if y is not None:
                D = supervised_to_unsupervised(D, y)  # SKLEARN_COMPAT
            item_to_tids = {k: Bitmap(np.where(D[k])[0]) for k in D.columns}
        else:
            item_to_tids = _to_vertical(D)
        self.standard_codetable_ = pd.Series(item_to_tids)
        usage = self.standard_codetable_.map(len).astype(np.uint32)

        ct_it = ((frozenset([e]), tids) for e, tids in item_to_tids.items())
        self.codetable_ = SortedDict(self._standard_cover_order, ct_it)

        codes = -_log2(usage / usage.sum())

        # L(code_ST(X)) = L(code_CT(X)), because CT=ST
        self.model_size_ = 2 * codes.sum()
        self.data_size_ = (codes * usage).sum()

        return self

    def _get_standard_codes(self, index):
        """compute the size of a codetable index given the standard codetable"""
        flat_items = list(chain(*index))
        items, counts = np.unique(flat_items, return_counts=True)

        usages = self.standard_codetable_.loc[items].map(len).astype(np.uint32)
        usages /= usages.sum()
        codes = -_log2(usages)
        return codes * counts

    def _compute_sizes(self, codetable):
        """
        Compute sizes for both the data and the model

        .. math:: L(D|CT)
        .. math:: L(CT|D)

        Parameters
        ----------
        codetable : Mapping
            A series mapping itemsets to their usage tids

        Returns
        -------
        tuple(float, float)
            (data_size, model_size)
        """
        isets, usages = zip(*((_[0], len(_[1]))
                              for _ in codetable.items() if len(_[1]) > 0))
        usages = np.array(usages, dtype=np.uint32)
        codes = -_log2(usages / usages.sum())

        stand_codes = self._get_standard_codes(isets)

        model_size = stand_codes.sum() + codes.sum()  # L(CTc|D) = L(X|ST) + L(X|CTc)
        data_size = (codes * usages).sum()
        return data_size, model_size

    def _prune(self, codetable, prune_set, model_size, data_size):
        """post-prune a codetable considering itemsets for which usage has decreased

        Parameters
        ----------
        codetable: SortedDict
        prune_set: set
            itemsets in ``codetable`` for which usage has decreased
        model_size: float
            current model_size for ``codetable``
        data_size: float
            current data size when encoding ``D`` with ``codetable``

        Returns
        -------
        new_codetable, new_data_size, new_model_size: SortedDict, float, float
            a tuple containing the pruned codetable, and new model size and data size
            w.r.t this new codetable
        """
        prune_set = {k for k in prune_set if len(k) > 1}  # remove singletons
        while prune_set:
            cand = min(prune_set, key=lambda e: len(codetable[e]))
            prune_set.discard(cand)

            ct = list(codetable)
            ct.remove(cand)

            D = {k: v.copy() for k, v in self.standard_codetable_.items()}  # TODO avoid data copies
            CTp = cover(D, ct)
            decreased = {
                k for k, v in CTp.items()
                if len(k) > 1 and len(v) < len(codetable[k])
            }

            d_size, m_size = self._compute_sizes(CTp)

            if d_size + m_size < model_size + data_size:
                codetable.update(CTp)
                del codetable[cand]
                prune_set.update(decreased)
                data_size, model_size = d_size, m_size

        return codetable, data_size, model_size
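# SLIM.evaluate above calls `self.codetable_.bisect(candidate)` on a SortedDict that is
# ordered by a key function (_standard_cover_order). A tiny standalone sketch of that
# idiom, with a simplified "longer itemsets first" order rather than SLIM's full
# Standard Cover Order:
from sortedcontainers import SortedDict

order = lambda iset: (-len(iset), tuple(sorted(iset)))
ct = SortedDict(order, {frozenset("ab"): None, frozenset("c"): None, frozenset("d"): None})
idx = ct.bisect(frozenset("bd"))   # index where a new 2-itemset would be inserted
print(idx)                          # 1: after the existing 2-itemset, before the singletons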
class StepVector():
    @classmethod
    def sliced(cls, other, start, end):
        newobj = cls(other.datatype, _tree=other._t, _bounds=(start, end))
        return newobj

    def __init__(self, datatype, _tree=None, _bounds=None):
        self.datatype = datatype
        if _tree is not None:
            self._t = _tree
        else:
            self._t = SortedDict()
        if _bounds is not None:
            self._bounds = _bounds
        else:
            self._bounds = (None, None)  # set upon slicing/subsetting

    def __getitem__(self, key):
        if type(key) == slice:
            if (key.step is not None) and (key.step != 1):
                raise ValueError("Invalid step value")
            start = key.start
            end = key.stop
            if self._bounds[0] is not None:
                if start is None:
                    start = self._bounds[0]
                elif start < self._bounds[0]:
                    raise ValueError("Start out of bounds")
            if self._bounds[1] is not None:
                if end is None:
                    end = self._bounds[1]
                elif end > self._bounds[1]:
                    raise ValueError("End out of bounds")
            return self.sliced(self, start, end)
        else:
            assert type(key) == int
            if self._bounds[0] is not None:
                if key < self._bounds[0]:
                    raise ValueError("Key out of bounds")
            if self._bounds[1] is not None:
                # compare against the upper bound
                if key >= self._bounds[1]:
                    raise ValueError("Key out of bounds")
            if self._t:
                try:
                    prevkey = self._floor_key(key)
                    return self._t[prevkey]
                except KeyError:
                    # no item smaller than or equal to key
                    return self.datatype()
            else:
                # empty tree
                return self.datatype()

    def __setitem__(self, key, value):
        if type(key) == slice:
            start = key.start
            end = key.stop
        else:
            assert type(key) == int
            start = key
            end = key + 1
        assert start is not None
        assert end is not None
        assert type(value) == self.datatype
        assert end >= start
        if start == end:
            return
        # check next val
        if self._t:
            try:
                nkey = self._floor_key(end, bisect="right")
                nvalue = self._t[nkey]
            except KeyError:
                nkey = None
                nvalue = None
        else:
            # empty tree
            nkey = None
            nvalue = None
        # check prev val
        if self._t:
            try:
                pkey = self._floor_key(start)
                pvalue = self._t[pkey]
            except KeyError:
                pkey = None
                pvalue = None
        else:
            pkey = None
            pvalue = None
        # remove intermediate steps if any
        if self._t:
            a = self._t.bisect_left(start)
            b = self._t.bisect(end)
            assert a <= b
            del self._t.iloc[a:b]
        # set an end marker if necessary
        if nkey is None:
            self._t[end] = self.datatype()
        elif nvalue != value:
            self._t[end] = nvalue
        # set a start marker if necessary
        if pkey is None or pvalue != value:
            self._t[start] = value

    def __iter__(self):
        start, end = self._bounds
        if not self._t:
            # empty tree
            if start is None or end is None:
                # FIXME: can't figure out a better thing to do if only one is None
                return
            else:
                if start < end:
                    yield (start, end, self.datatype())
                return
        if start is None:
            a = 0
        else:
            a = max(0, self._bisect_right(start) - 1)
        if end is None:
            b = len(self._t)
        else:
            b = self._bisect_right(end)
        assert b >= a
        if a == b:
            if a is None:
                start = self._t[a]
            if b is None:
                end = self._t[b]
            if start < end:
                yield (start, end, self.datatype())
            return
        it = self._t.islice(a, b)
        currkey = next(it)
        currvalue = self._t[currkey]
        if start is not None:
            currkey = max(start, currkey)
            if start < currkey:
                yield (start, currkey, self.datatype())
        prevkey, prevvalue = currkey, currvalue
        for currkey in it:
            currvalue = self._t[currkey]
            yield (prevkey, currkey, prevvalue)
            prevkey = currkey
            prevvalue = currvalue
        if end is not None:
            if currkey < end:
                yield (currkey, end, prevvalue)

    def add_value(self, start, end, value):
        assert type(value) == self.datatype
        # can't modify self while iterating over values; that would change the tree
        # and thus mess up the iteration
        items = list(self[start:end])
        for a, b, x in items:
            if self.datatype == set:
                y = x.copy()
                y.update(value)
            else:
                y = x + value
            self[a:b] = y

    def _bisect_left(self, key):
        return self._t.bisect_left(key)

    def _bisect_right(self, key):
        return self._t.bisect_right(key)

    def _floor_key(self, key, bisect="left"):
        """Returns the greatest key less than or equal to key"""
        if bisect == "right":
            p = self._bisect_right(key)
        else:
            p = self._bisect_left(key)
        if p == 0:
            raise KeyError
        else:
            return self._t.iloc[p - 1]
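# A hedged usage sketch for StepVector above; it assumes the older sortedcontainers
# API where `SortedDict.iloc` (used by __setitem__ and _floor_key) is still available:
sv = StepVector(int)
sv[0:10] = 3        # one step: value 3 on [0, 10)
sv[4:6] = 7         # split that step in the middle
print(list(sv[0:10]))   # expected: [(0, 4, 3), (4, 6, 7), (6, 10, 3)]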
class SymReader:
    """
    Helper class that looks for symbols in multiple binaries,
    given a memory map where they are loaded and an address.
    """

    def __init__(self, vmmap, path):
        self.vmmap = vmmap
        """Memory map used to locate the start address of sections."""

        self.paths = path
        """Search paths for binaries that contain symbols."""

        self._symbol_map = SortedDict()
        """Internal mapping of address to symbol name"""

        self.caprelocs = {}
        """Capability relocations, populated by fetch_caprelocs"""

        self.captable_mappings = SortedDict()
        """Hold capability table mappings as start => (end, file)"""

        self._capreloc_fmt = struct.Struct(">5Q")
        """5 uint64 in order: reloc_target, object, offset, size, perms"""

        self.loaded = []
        """List of loaded executables"""

        with ProgressTimer("Load symbols", logger):
            self._load_mapped()
        logger.debug("Symbol map %s", self)

    def _find_elf(self, vme_path):
        fname = os.path.basename(vme_path)
        if not fname:
            return None
        logger.debug("looking for %s", fname)
        for path in self.paths:
            bin_file = os.path.join(path, fname)
            if os.path.exists(bin_file):
                logger.info("Found symbols for %s (%s)", fname, bin_file)
                return bin_file
        logger.debug("No ELF found for %s", fname)
        return None

    def map_base(self, vme_path):
        """Find the base address where this file has been mapped in the vmmap"""
        lower_addr = np.inf
        for vme in self.vmmap.get_model():
            if (os.path.basename(vme.path) == os.path.basename(vme_path) and
                    vme.start < lower_addr):
                lower_addr = vme.start
        if lower_addr == np.inf:
            return None
        return lower_addr

    def _load_mapped(self):
        for vme in self.vmmap.get_model():
            bin_file = self._find_elf(vme.path)
            if bin_file is None or bin_file in self.loaded:
                # has the file already been loaded?
                continue
            self.loaded.append(bin_file)
            elf_file = ELFFile(open(bin_file, "rb"))
            symtab = elf_file.get_section_by_name(".symtab")
            # do we need to relocate the addresses?
            if elf_file.header["e_type"] == "ET_DYN":
                map_base = self.map_base(vme.path)
            else:
                map_base = 0
            for sym in symtab.iter_symbols():
                if sym["st_shndx"] != ENUM_ST_SHNDX["SHN_UNDEF"]:
                    self._symbol_map[map_base + sym["st_value"]] = (sym.name, bin_file)
        kern_image = self._find_elf("kernel")
        kern_full = self._find_elf("kernel.full")
        if kern_full is not None:
            self.loaded.append(kern_full)
            self._load_kernel(kern_full)
        elif kern_image is not None:
            self.loaded.append(kern_image)
            self._load_kernel(kern_image)

    def _load_kernel(self, path):
        elf_file = ELFFile(open(path, "rb"))
        symtab = elf_file.get_section_by_name(".symtab")
        # the kernel should not be ET_DYN
        assert elf_file.header["e_type"] != "ET_DYN"
        for sym in symtab.iter_symbols():
            self._symbol_map[sym["st_value"]] = (sym.name, path)

    def __str__(self):
        data = io.StringIO()
        data.write("SymReader loaded symbols:\n")
        for addr, (sym, fname) in self._symbol_map.items():
            data.write("0x{:x} {} {}\n".format(addr, fname, sym))
        return data.getvalue()

    def fetch_caprelocs(self):
        """Populate the caprelocs map"""
        for bin_file in self.loaded:
            elf_file = ELFFile(open(bin_file, "rb"))
            # grab the __cap_relocs section
            relocs = elf_file.get_section_by_name("__cap_relocs")
            if relocs is None:
                logger.info("No capability relocations for %s", bin_file)
                continue
            # do we need to relocate the addresses?
            if elf_file.header["e_type"] == "ET_DYN":
                map_base = self.map_base(bin_file)
            else:
                map_base = 0
            unpacked_relocs = self._capreloc_fmt.iter_unpack(relocs.data())
            nrelocs = 0
            for reloc in unpacked_relocs:
                # caprelocs[target] = [base, offset, length, perms]
                nrelocs += 1
                self.caprelocs[map_base + reloc[0]] = reloc[1:]
            logger.info("Found caprelocs for %s, %d entries", bin_file, nrelocs)

    def fetch_cap_tables(self):
        for path in self.loaded:
            elf = ELFFile(open(path, "rb"))
            if elf.header["e_type"] == "ET_DYN":
                map_base = self.map_base(path)
            else:
                map_base = 0
            assert map_base is not None
            # grab the section with the given name
            captable = elf.get_section_by_name(".cap_table")
            if captable is None:
                logger.info("No capability table for %s", path)
                continue
            sec_start = captable["sh_addr"] + map_base
            sec_end = sec_start + captable["sh_size"]
            logger.info("Found capability table %s @ [0x%x, 0x%x]",
                        path, sec_start, sec_end)
            self.captable_mappings[sec_start] = {"end": sec_end, "path": path}

    def get_captable(self, addr):
        index = self.captable_mappings.bisect(addr) - 1
        if index < 0:
            return None
        key = self.captable_mappings.iloc[index]
        if addr > self.captable_mappings[key]["end"]:
            return None
        return self.captable_mappings[key]["path"]

    def find_file(self, addr):
        """Find the file where the symbol at the given address is defined."""
        try:
            sym, fname = self._symbol_map[addr]
            return fname
        except KeyError:
            return None

    def find_symbol(self, addr):
        """Return the symbol where the address is found, if possible."""
        entry = self.find_address(addr)
        if entry:
            return entry[0]
        return None

    def find_address(self, addr):
        """Return the symbol and file where the address is found."""
        try:
            sym, fname = self._symbol_map[addr]
            return (sym, os.path.basename(fname))
        except KeyError:
            return None

    def find_function(self, addr):
        """
        Return the symbol and file of the function containing the given address,
        if possible.
        """
        index = self._symbol_map.bisect(addr) - 1
        if index < 0:
            return None
        key = self._symbol_map.iloc[index]
        sym, fname = self._symbol_map[key]
        return (sym, os.path.basename(fname))
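# The find_function/get_captable lookups above are instances of the same "floor entry"
# pattern: bisect right, step back one, and treat that entry as the region containing
# the address. A standalone sketch of that pattern, using the modern keys()[index]
# equivalent of the deprecated `.iloc[index]`; addresses and names are made up:
from sortedcontainers import SortedDict

symbols = SortedDict({0x1000: "main", 0x1400: "helper", 0x2000: "cleanup"})

def find_enclosing(addr):
    index = symbols.bisect(addr) - 1
    if index < 0:
        return None  # address lies before the first known symbol
    key = symbols.keys()[index]
    return symbols[key]

print(find_enclosing(0x15f0))  # "helper"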
class PiecewiseConstantFunction(Generic[T]):
    def __init__(self, initial_value: float = 0) -> None:
        """ Initialize the constant function to a particular value

        :param initial_value: the starting value for the function
        """
        self.breakpoints = SortedDict()
        self._initial_value: float = initial_value

    def add_breakpoint(self, xval: XValue[T], yval: float, squash: bool = True) -> None:
        """ Add a breakpoint to the function and update the value

        Let f(x) be the original function, and next_bp be the first breakpoint > xval; after
        calling this method, the function will be modified to f'(x) = yval for x in [xval, next_bp)

        :param xval: the x-position of the breakpoint to add/modify
        :param yval: the value to set the function to at xval
        :param squash: if True and f(xval) = yval before calling this method, the function
            will remain unchanged
        """
        if squash and self.call(xval) == yval:
            return
        self.breakpoints[xval] = yval

    def add_delta(self, xval: XValue[T], delta: float) -> None:
        """ Modify the function value for x >= xval

        Let f(x) be the original function; after calling this method, the function will be
        modified to f'(x) = f(x) + delta for all x >= xval

        :param xval: the x-position of the breakpoint to add/modify
        :param delta: the amount to shift the function value by at xval
        """
        if delta == 0:
            return

        if xval not in self.breakpoints:
            self.breakpoints[xval] = self.call(xval)

        for x in self.breakpoints.irange(xval):
            self.breakpoints[x] += delta

        self.values.cache_clear()
        self.integrals.cache_clear()

    def call(self, xval: XValue[T]) -> float:
        """ Compute the output of the function at a point

        :param xval: the x-position to compute
        :returns: f(xval)
        """
        if len(self.breakpoints) == 0 or xval < self.breakpoints.keys()[0]:
            return self._initial_value
        else:
            lower_index = self.breakpoints.bisect(xval) - 1
            return self.breakpoints.values()[lower_index]

    def _breakpoint_info(
        self,
        index: Optional[int],
    ) -> Tuple[Optional[int], Optional[XValue[T]], float]:
        """ Helper function for computing breakpoint information

        :param index: index of the breakpoint to compute
        :returns: (index, breakpoint, value)
            * index is the breakpoint index (if it exists), or None if we're off the end
            * breakpoint is the x-value of the breakpoint, or None if we're off the end
            * value is f(breakpoint), or f(last_breakpoint) if we're off the end
        """
        try:
            breakpoint, value = self.breakpoints.peekitem(index)
        except IndexError:
            index = None
            breakpoint, value = None, self.breakpoints.values()[-1]
        return (index, breakpoint, value)

    @lru_cache(maxsize=_LRU_CACHE_SIZE)  # cache results of calls to this function
    def values(self, start: XValue[T], stop: XValue[T],
               step: XValueDiff[T]) -> 'SortedDict[XValue[T], float]':
        """ Compute a sequence of values of the function

        This is more efficient than [self.call(xval) for xval in range(start, stop, step)]
        because each self.call(..) takes O(log n) time due to the binary tree structure of
        self.breakpoints. This method can compute the range of values in time linear in the
        range, which is significantly faster for large value ranges.

        :param start: lower bound of value sequence
        :param stop: upper bound of value sequence
        :param step: width between points in the sequence
        :returns: a SortedDict of the values of the function between start and stop, with the
            x-distance between each data-point equal to `step`; like normal "range" functions
            the right endpoint is not included
        """
        step = step or (stop - start)
        if len(self.breakpoints) == 0:
            num_values = int(math.ceil((stop - start) / step))
            return SortedDict([(start + step * i, self._initial_value) for i in range(num_values)])

        curr_xval = start
        curr_value = self.call(start)
        next_index, next_breakpoint, next_value = self._breakpoint_info(
            self.breakpoints.bisect(start))

        sequence = SortedDict()
        while curr_xval < stop:
            sequence[curr_xval] = curr_value

            next_xval = min(stop, curr_xval + step)
            while next_breakpoint and next_xval >= next_breakpoint:
                assert next_index is not None  # if next_breakpoint is set, next_index should also be set
                curr_value = next_value
                next_index, next_breakpoint, next_value = self._breakpoint_info(next_index + 1)
            curr_xval = next_xval

        return sequence

    @lru_cache(maxsize=_LRU_CACHE_SIZE)  # cache results of calls to this function
    def integrals(
        self,
        start: XValue[T],
        stop: XValue[T],
        step: XValueDiff[T],
        transform: Callable[[XValueDiff[T]], float] = lambda x: cast(float, x),
    ) -> 'SortedDict[XValue[T], float]':
        """ Compute a sequence of integrals of the function

        :param start: lower bound of integral sequence
        :param stop: upper bound of integral sequence
        :param step: width of each "chunk" of the integral sequence
        :param transform: function to apply to x-widths before computing the integral
        :returns: a SortedDict of the numeric integral values of the function between start
            and stop; each integral has a range of size `step`, and the key-value is the left
            endpoint of the chunk
        """
        step = step or (stop - start)
        if len(self.breakpoints) == 0:
            # If there are no breakpoints, just split up the range into even widths and compute
            # (width * self._initial_value) for each chunk.
            step_width = transform(step)
            range_width = transform(stop - start)
            num_full_chunks = int(range_width // step_width)
            sequence = SortedDict([
                (start + step * i, step_width * self._initial_value)
                for i in range(num_full_chunks)
            ])

            # If the width does not evenly divide the range, compute the last chunk separately
            if range_width % step_width != 0:
                sequence[start + step * num_full_chunks] = \
                    range_width % step_width * self._initial_value
            return sequence

        # Set up starting loop parameters
        curr_xval = start
        curr_value = self.call(start)
        next_index, next_breakpoint, next_value = self._breakpoint_info(
            self.breakpoints.bisect(start))

        # Loop through the entire range and compute the integral of each chunk
        sequence = SortedDict()
        while curr_xval < stop:
            orig_xval = curr_xval
            next_xval = min(stop, curr_xval + step)

            # For each breakpoint in [curr_xval, next_xval), compute the area of that sub-chunk
            next_integral: float = 0
            while next_breakpoint and next_xval >= next_breakpoint:
                assert next_index is not None  # if next_breakpoint is set, next_index should also be set
                next_integral += transform(next_breakpoint - curr_xval) * curr_value
                curr_xval = next_breakpoint
                curr_value = next_value
                next_index, next_breakpoint, next_value = self._breakpoint_info(next_index + 1)

            # Handle any remaining width between the last breakpoint and the end of the chunk
            next_integral += transform(next_xval - curr_xval) * curr_value
            sequence[orig_xval] = next_integral

            curr_xval = next_xval

        return sequence

    def integral(
        self,
        start: XValue[T],
        stop: XValue[T],
        transform: Callable[[XValueDiff[T]], float] = lambda x: cast(float, x),
    ) -> float:
        """ Helper function to compute the integral of the whole specified range

        :param start: lower bound of the integral
        :param stop: upper bound of the integral
        :returns: the integral of the function between start and stop
        """
        return self.integrals(start, stop, (stop - start), transform).values()[0]

    def __str__(self) -> str:
        ret = f'{self._initial_value}, x < {self.breakpoints.keys()[0]}\n'
        for xval, yval in self.breakpoints.items():
            ret += f'{yval}, x >= {xval}\n'
        return ret

    def __add__(self, other: 'PiecewiseConstantFunction[T]') -> 'PiecewiseConstantFunction[T]':
        new_func: 'PiecewiseConstantFunction[T]' = PiecewiseConstantFunction(
            self._initial_value + other._initial_value)
        for xval, y0, y1 in _merged_breakpoints(self, other):
            new_func.add_breakpoint(xval, y0 + y1)
        return new_func

    def __sub__(self, other: 'PiecewiseConstantFunction[T]') -> 'PiecewiseConstantFunction[T]':
        new_func: 'PiecewiseConstantFunction[T]' = PiecewiseConstantFunction(
            self._initial_value - other._initial_value)
        for xval, y0, y1 in _merged_breakpoints(self, other):
            new_func.add_breakpoint(xval, y0 - y1)
        return new_func

    def __mul__(self, other: 'PiecewiseConstantFunction[T]') -> 'PiecewiseConstantFunction[T]':
        new_func: 'PiecewiseConstantFunction[T]' = PiecewiseConstantFunction(
            self._initial_value * other._initial_value)
        for xval, y0, y1 in _merged_breakpoints(self, other):
            new_func.add_breakpoint(xval, y0 * y1)
        return new_func

    def __truediv__(self, other: 'PiecewiseConstantFunction[T]') -> 'PiecewiseConstantFunction[T]':
        try:
            new_func: 'PiecewiseConstantFunction[T]' = PiecewiseConstantFunction(
                self._initial_value / other._initial_value)
        except ZeroDivisionError:
            new_func = PiecewiseConstantFunction()

        for xval, y0, y1 in _merged_breakpoints(self, other):
            try:
                new_func.add_breakpoint(xval, y0 / y1)
            except ZeroDivisionError:
                new_func.add_breakpoint(xval, 0)
        return new_func
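# A hedged usage sketch for PiecewiseConstantFunction above, assuming plain numbers are
# acceptable as XValue/XValueDiff (the class is generic over T):
fn = PiecewiseConstantFunction(initial_value=1)
fn.add_breakpoint(5, 3)     # f(x) = 1 for x < 5, f(x) = 3 for x >= 5
fn.add_delta(8, 2)          # shift the function up by 2 for all x >= 8
print(fn.call(4), fn.call(6), fn.call(9))   # expected: 1 3 5
print(fn.integral(0, 10))                   # expected: 5*1 + 3*3 + 2*5 = 24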
class ModelDelayWrapper:
    """A wrapper around QuadrotorModel that delays observations and actions and introduces
    jitter to the controller execution.
    """
    __slots__ = "delay_scale", "model", "controller_jitter", "observation_delay", "observation_jitter", \
                "action_delay", "action_jitter", "controller_period", "past_states", "future_actions", "time",

    def __init__(self,
                 model: QuadrotorModel,
                 delay_scale: float = 1,
                 controller_jitter: Union[float, str] = 0,
                 controller_period: float = 0.01,
                 observation_delay: float = 0,
                 observation_jitter: Union[float, str] = 0,
                 action_delay: float = 0,
                 action_jitter: Union[float, str] = 0):
        """
        Creates a new ModelDelayWrapper wrapping the specified model. All times specified in this
        class are in seconds.

        Note: All jitters can either be specified as a float, in which case they are interpreted as
        the standard deviation of a mean-centered gaussian distribution from which to draw them, or
        as a path to a text file that can be loaded using np.loadtxt. The jitters are then generated
        by drawing random elements from the resulting numpy array. This feature is useful if you
        measured the jitter in your real hardware system and want to reproduce the jitter
        distribution in simulation as closely as possible.

        :param model: The model to wrap.
        :param delay_scale: Global scaling factor by which all other delays are multiplied. Default is 1.
        :param controller_jitter: The amount of jitter to apply to the control period.
        :param controller_period: The duration between calls to the controller. The simulation will
            advance this much on each call to the step method.
        :param observation_delay: The age of the state that is to be observed.
        :param observation_jitter: The amount of jitter in the age of the state that is observed.
        :param action_delay: The amount of time that has to pass before an action is applied to the system.
        :param action_jitter: The jitter to be added to the action delay.
        """
        if isinstance(observation_jitter, str):
            observation_jitter = np.loadtxt(observation_jitter)
        if isinstance(action_jitter, str):
            action_jitter = np.loadtxt(action_jitter)
        if isinstance(controller_jitter, str):
            controller_jitter = np.loadtxt(controller_jitter)

        observation_jitter = np.asarray(observation_jitter)
        action_jitter = np.asarray(action_jitter)
        controller_jitter = np.asarray(controller_jitter)

        assert observation_delay >= 0
        assert action_delay >= 0
        assert np.all(observation_jitter >= 0)
        assert np.all(action_jitter >= 0)
        assert delay_scale >= 0

        self.delay_scale = delay_scale
        self.model = model
        self.controller_jitter = controller_jitter
        self.observation_delay = observation_delay
        self.observation_jitter = observation_jitter
        self.action_delay = action_delay
        self.action_jitter = action_jitter
        self.controller_period = controller_period
        self.past_states = SortedDict()
        self.future_actions = SortedDict()
        self.time = None

    def reset(self, initial_action: np.ndarray, initial_state: SysState) -> (SysState, float, float):
        """
        Resets the state of the model delay wrapper by clearing all past states and future actions
        and initializing with the specified initial action and state.

        :param initial_action: The initial action to assume.
        :param initial_state: The initial state to assume.
        :return: A triple consisting of 1. the initial observed state (which is just the initial
            state that was passed), 2. the initial controller period and 3. the initial observation age.
        """
        self.past_states = SortedDict()
        self.future_actions = SortedDict()
        observation_delay = self._sample_observation_delay()
        self.time = observation_delay
        self.past_states[0] = (initial_action, initial_state)
        return initial_state, self._sample_controller_period(), observation_delay

    def compute_past_state(self, time: float) -> SysState:
        """
        Computes the state at some arbitrary point in the past.

        :param time: The time at which to compute the state.
        :return: The state at the time.
        """
        # Find the last computed state just before the observed state in the state history
        # (sbt = state before t = most recent state before t)
        sbt_index = self.past_states.bisect(time) - 1
        assert sbt_index >= 0
        t_sbt, (sbt_action, sbt) = self.past_states.peekitem(sbt_index)
        assert time >= t_sbt
        return self.model.next_state(sbt, sbt_action, time - t_sbt)

    def compute_current_state(self) -> SysState:
        """
        Computes the current state of the system.

        :return: The current state of the system.
        """
        return self.compute_past_state(self.time)

    def step(self, action: np.ndarray) -> (SysState, float, float):
        """
        Applies an action to the delayed system and advances the time by the controller period.

        :param action: The action to execute (probably in the future because it is delayed).
        :return: A triple consisting of 1. the observed state (probably from the past), 2. the
            actual controller period (with jitter) and 3. the age of the observed state.
        """
        # Insert the future action into the action schedule
        self.future_actions[self.time + self._sample_action_delay()] = action

        current_controller_period = self._sample_controller_period()
        self.time += current_controller_period
        self._materialize_past_states()

        # Sample the time of observation
        t_obs = np.clip(self.time - self._sample_observation_delay(), 0, self.time)

        # Compute the observed state
        observed_state = self.compute_past_state(t_obs)

        return observed_state, current_controller_period, self.time - t_obs

    def _materialize_past_states(self):
        # Compute states that now lie in the past
        while len(self.future_actions) > 0 and self.future_actions.peekitem(0)[0] <= self.time:
            t_a, new_action = self.future_actions.popitem(0)
            t_sba, (old_action, sba) = self.past_states.peekitem()
            assert t_a - t_sba >= 0
            new_state = self.model.next_state(sba, old_action, t_a - t_sba)
            self.past_states[t_a] = (new_action, new_state)

    def _sample_action_delay(self):
        if self.action_jitter.ndim == 0:
            return np.clip(self.action_delay + np.random.randn() * self.action_jitter,
                           0, np.inf) * self.delay_scale
        else:
            return np.clip(self.action_delay + np.random.choice(self.action_jitter),
                           0, np.inf) * self.delay_scale

    def _sample_observation_delay(self):
        if self.observation_jitter.ndim == 0:
            return np.clip(self.observation_delay + np.random.randn() * self.observation_jitter, 0, np.inf) \
                * self.delay_scale
        else:
            return np.clip(self.observation_delay + np.random.choice(self.observation_jitter), 0, np.inf) \
                * self.delay_scale

    def _sample_controller_period(self):
        if self.controller_jitter.ndim == 0:
            return np.clip(self.controller_period + np.random.randn() * self.controller_jitter, 0, np.inf)
        else:
            return np.clip(self.controller_period + np.random.choice(self.controller_jitter), 0, np.inf)
def test_bisect_key():
    temp = SortedDict(modulo, 7, ((val, val) for val in range(100)))
    assert all(temp.bisect(val) == ((val % 10) + 1) * 10 for val in range(100))
    assert all(temp.bisect_right(val) == ((val % 10) + 1) * 10 for val in range(100))
    assert all(temp.bisect_left(val) == (val % 10) * 10 for val in range(100))
# iterate through the whole directory
for subdir, dirs, files in os.walk(photosDirectory):
    for file in files:
        if isPhoto(file):
            try:
                exif = getExif(os.path.join(subdir, file))
                if not cameraIsValid(exif):
                    continue
                # get focal length and convert from rational data type to float
                focalLength = exif[FOCALLENGTH_TAG][0] / exif[FOCALLENGTH_TAG][1]
                # count every focal length occurrence in the dictionary
                if focalLength in occurences:
                    occurences[focalLength] = occurences[focalLength] + 1
                else:
                    # find the nearest known focal length
                    index = occurences.bisect(focalLength)
                    greater = occurences.iloc[index]
                    smaller = occurences.iloc[index - 1]
                    nearestFL = greater if (greater - focalLength < focalLength - smaller) else smaller
                    occurences[nearestFL] = occurences[nearestFL] + 1
            except (KeyError, TypeError, IndexError):
                # there is no focal length info in the image exif data (Key/Type/IndexError)
                pass

# plot the graph
position = arange(len(focalLengths)) + .5
barh(position, occurences.values(), align='center', color='#FF0000')
yticks(position, occurences.keys())
xlabel('Occurrences')
ylabel('Focal length')
title('Focal length usage analysis')
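# The "find nearest" step above can raise IndexError when the focal length falls past the
# last key (which the script simply swallows). A standalone, bounds-checked sketch of the
# same nearest-key lookup on a SortedDict; the focal lengths below are only illustrative:
from sortedcontainers import SortedDict

occurences = SortedDict({18: 0, 35: 0, 50: 0, 85: 0})

def nearest_key(d, value):
    index = d.bisect(value)
    if index == 0:
        return d.keys()[0]
    if index == len(d):
        return d.keys()[-1]
    greater, smaller = d.keys()[index], d.keys()[index - 1]
    return greater if greater - value < value - smaller else smaller

print(nearest_key(occurences, 42))  # 35 (ties go to the smaller key, as in the script)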
class GrowSpaceSortedEnv(gym.Env):
    def __init__(self, width=DEFAULT_RES, height=DEFAULT_RES, light_dif=LIGHT_DIFFUSION):
        self.width = width
        self.height = height
        self.seed()
        self.light_dif = light_dif
        self.action_space = gym.spaces.Discrete(3)  # L, R, keep of light paddle
        self.observation_space = gym.spaces.Box(0, 255, shape=(height, width, 3), dtype=np.uint8)
        self.steps = 0

        # data format for branches: they are indexed/sorted by x_end position and each
        # key has a list of values that are [y_end, x_start, y_start, children]
        self.branches = SortedDict()
        self.points = SortedDict()

    def seed(self, seed=None):
        return [np.random.seed(seed)]

    def light_move_R(self):
        if np.around(self.light_left, 1) >= 1 - LIGHT_WIDTH - LIGHT_STEP:  # limit of coordinates
            self.light_left = 1 - LIGHT_WIDTH  # stay put
        else:
            self.light_left += LIGHT_STEP  # move by .1 right

    def light_move_L(self):
        if np.around(self.light_left, 1) <= LIGHT_STEP:  # limit of coordinates
            self.light_left = 0
        else:
            self.light_left -= LIGHT_STEP  # move by .1 left

    def find_closest_branch(self, point_x, branches):
        branch_names = []
        branch_distances = []
        # prefilter by x
        if len(branches) > MAX_BRANCHES:
            branches_trimmed = sample(branches, MAX_BRANCHES)
        else:
            branches_trimmed = branches
        for branch in branches_trimmed:
            dist_x = branch - point_x
            if np.abs(dist_x) <= MAX_GROW_DIST:
                # we got a potential candidate - now let's check Y
                dist_y = self.branches[branch][0] - self.points[point_x]
                if np.abs(dist_y) <= MAX_GROW_DIST:
                    dist = norm((dist_x, dist_y))
                    if dist <= MAX_GROW_DIST:
                        branch_names.append(branch)
                        branch_distances.append(dist)
        if len(branch_distances) == 0:
            return None, None
        argmin = np.argmin(branch_distances)
        return branch_names[argmin], branch_distances[argmin]

    def grow_plant(self):
        points_filtered = list(
            self.get_points_in_range(self.light_left - MAX_GROW_DIST,
                                     self.light_right + MAX_GROW_DIST))
        branches_filtered = list(
            self.get_branches_in_range(self.light_left, self.light_right))

        growths = {}  # maps a branch to the list of points it grows towards
        for point in points_filtered:
            closest_branch, dist = self.find_closest_branch(point, branches_filtered)
            if closest_branch is None:
                continue
            if dist < MIN_GROW_DIST:
                self.points.pop(point)
            elif dist < MAX_GROW_DIST:
                if closest_branch not in growths:
                    growths[closest_branch] = [point]
                else:
                    growths[closest_branch].append(point)

        for branch, points in growths.items():
            end_x = (branch + (sum(points) / len(points) - branch) * BRANCH_LENGTH
                     )  # alternatively sum(points)/len(points)
            branch_y = self.branches[branch][0]
            point_ys = [self.points[p] for p in points]
            end_y = branch_y + (sum(point_ys) / len(point_ys) - branch_y) * BRANCH_LENGTH
            while end_x in self.branches:
                end_x += EPSILON  # keys need to be unique in the branches dict
            self.branches[end_x] = [end_y, branch, self.branches[branch][0], 0]
        # update_all_branch_widths(branches)

    def get_points_in_range(self, start, end):
        return self.points.irange(start, end)  # this is dark SortedDict magic

    def get_branches_in_range(self, start, end):
        return self.branches.irange(start, end)  # this is dark SortedDict magic

    def branch_bisect_range(self, lower, upper):
        start = self.branches.bisect(lower)
        end = self.branches.bisect_right(upper)
        return self.branches[start:end]

    def get_branch_start_end_thiccness(self, end_x):
        end_y, start_x, start_y, children = self.branches[end_x]
        thicc = ir((children + 1) * BRANCH_THICCNESS * self.width)
        return (
            (ir(start_x * self.width), ir(start_y * self.height)),
            (ir(end_x * self.width), ir(end_y * self.height)),
            thicc,
        )

    def get_observation(self, debug_show_scatter=False):
        # new empty image
        img = np.zeros((self.height, self.width, 3), dtype=np.uint8)

        # place the light as a rectangle
        x1 = ir(self.light_left * self.width)
        x2 = ir(self.light_right * self.width)
        cv2.rectangle(img, pt1=(x1, 0), pt2=(x2, self.height), color=LIGHT_COLOR, thickness=-1)

        if debug_show_scatter:
            points_filtered = self.get_points_in_range(self.light_left, self.light_right)
            for k in list(points_filtered):
                x = ir(k * self.width)
                y = ir(self.points[k] * self.height)
                cv2.circle(img, center=(x, y), radius=POINT_RADIUS, color=POINT_COLOR, thickness=-1)

        # draw the plant as a series of lines (1 branch = 1 line)
        for branch_x_end in self.branches.keys():
            start, end, thiccness = self.get_branch_start_end_thiccness(branch_x_end)
            cv2.line(img, pt1=start, pt2=end, color=PLANT_COLOR, thickness=thiccness)

        # place the goal as a filled circle with center and radius;
        # also important - place the goal last because it must always be visible
        x = ir(self.target[0] * self.width)
        y = ir(self.target[1] * self.height)
        cv2.circle(img, center=(x, y), radius=ir(0.03 * self.width), color=(0, 0, 255), thickness=-1)

        # flip image, because the plant grows from the bottom, not the top
        img = cv2.flip(img, 0)
        return img

    def reset(self):
        random_start = np.random.rand()  # in range [0, 1]
        self.branches.clear()
        self.points.clear()
        self.branches[random_start] = [FIRST_BRANCH_HEIGHT, random_start, 0, 0]
        self.target = [np.random.uniform(0, 1), np.random.uniform(0.8, 1)]

        if random_start >= (1 - LIGHT_WIDTH / 2):
            self.light_left = 1 - LIGHT_WIDTH
        elif random_start <= LIGHT_WIDTH / 2:
            self.light_left = 0
        else:
            self.light_left = random_start - (LIGHT_WIDTH / 2)
        self.light_right = self.light_left + LIGHT_WIDTH

        points_x = np.random.uniform(0, 1, self.light_dif)
        points_y = np.random.uniform(FIRST_BRANCH_HEIGHT + 0.1, 1, self.light_dif)
        for i in range(self.light_dif):
            while points_x[i] in self.points:
                points_x[i] += EPSILON
            self.points[points_x[i]] = points_y[i]

        self.steps = 0
        return self.get_observation()

    def step(self, action):
        # Two possible move actions: move the light left or right
        if action == 0:
            self.light_move_L()
        if action == 1:
            self.light_move_R()
        self.light_right = self.light_left + LIGHT_WIDTH
        if action == 2:
            # then we keep the light in place
            pass

        self.grow_plant()

        # # Calculate distance to target
        # reward = 1 / self.distance_target(tips)  ####### TODO
        reward = 0  # TODO

        # Render image of environment at current state
        observation = self.get_observation()  # image
        done = False  # because we don't have a terminal condition
        misc = {}  # (optional) additional information about plant/episode/other stuff, leave empty for now

        # print("steps:", self.steps)  # sanity check
        self.steps += 1
        return observation, reward, done, misc

    def render(self, mode="human", debug_show_scatter=False):
        # or mode="rgb_array"
        img = self.get_observation(debug_show_scatter)
        if mode == "human":
            cv2.imshow("plant", img)  # create opencv window to show plant
            cv2.waitKey(1)  # this is necessary or the window closes immediately
        else:
            return img
class SLIM(BaseMiner, MDLOptimizer, InteractiveMiner): """SLIM: Directly Mining Descriptive Patterns SLIM looks for a compressed representation of transactional data. This compressed representation if a set of descriptive patterns, and can be used to: - provide a natively interpretable modeling of this data - make predictions on new data, using this condensed representation as an encoding scheme Parameters ---------- k: int, default=50 Number of non-singleton itemsets to mine. A singleton is an itemset containing a single item. pruning: bool, default=True Either to activate pruning or not. Pruned itemsets may be useful at prediction time, so it is usually recommended to set it to `False` to build a classifier. The model will be less concise, but will lead to more accurate predictions on average. n_items: int, default=200 Number of most frequent items to consider for mining. As SLIM is highly dependant from the set of symbols from which it refines its codetable, lowering this argument will significantly improve runtime. Note: The reconstruction is lossless from this set of items. If the input data has more than `n_items` items, then the reconstruction will be lossy w.r.t this input data. tol: float, default=0.5 Minimum compression gain (in bits) for a candidate to be accepted Examples -------- >>> from skmine.itemsets import SLIM >>> D = [['bananas', 'milk'], ['milk', 'bananas', 'cookies'], ['cookies', 'butter', 'tea']] >>> SLIM().fit(D).discover(singletons=True, usage_tids=True) (bananas, milk) [0, 1] (butter, tea) [2] (cookies,) [1, 2] dtype: object References ---------- .. [1] Smets, K & Vreeken, J "Slim: Directly Mining Descriptive Patterns", 2012 .. [2] Gandhi, M & Vreeken, J "Slimmer, outsmarting Slim", 2014 """ def __init__( self, *, k=50, pruning=True, n_items=200, tol=0.5, ): self.n_items = n_items self.tol = tol self.standard_codetable_ = None self.codetable_ = SortedDict() self.model_size_ = None # L(CT|D) self.data_size_ = None # L(D|CT) self.pruning = pruning self.k = k def fit(self, D, y=None): # pylint:disable = too-many-locals """fit SLIM on a transactional dataset This generate new candidate patterns and add those which improve compression, iteratibely refining ``self.codetable_`` Parameters ---------- D: iterable of iterables or array-like Transactional dataset, either as an iterable of iterables or encoded as tabular binary data """ self.prefit(D, y=y) seen_cands = set() k = 0 while k < self.k: candidates = self.generate_candidates(stack=seen_cands) for cand, _ in candidates: data_size, model_size, usages = self.evaluate(cand) diff = (self.model_size_ + self.data_size_) - (data_size + model_size) if diff >= self.tol: self.update(usages=usages, data_size=data_size, model_size=model_size) k = sum(map(lambda iset: len(iset) > 1, self.codetable_)) if k >= self.k: break if not candidates: # if empty candidate generation Warning( f"could not find `{self.k}` itemsets, try with a lower `tol`" ) break return self def decision_function(self, D): """Compute covers on new data, and return code length This function is named ``decision_function`` because code lengths represent the distance between a point and the current codetable. Setting ``pruning`` to False when creating the model is recommended to cover unseen data, and especially when building a classifier. 
Parameters ---------- D: pd.DataFrame or np.ndarray new data to make predictions on, in tabular format Example ------- >>> from skmine.itemsets import SLIM; import pandas as pd >>> def to_tabular(D): return pd.Series(D).str.join('|').str.get_dummies(sep="|") >>> D = [['bananas', 'milk'], ['milk', 'bananas', 'cookies'], ['cookies', 'butter', 'tea']] >>> new_D = to_tabular([['cookies', 'butter']]) >>> slim = SLIM().fit(to_tabular(D)) >>> slim.decision_function(new_D) 0 -1.321928 dtype: float32 See Also -------- cover discover """ mat = self.cover(D) code_lengths = self.discover(singletons=True, usage_tids=False) ct_codes = code_lengths / code_lengths.sum() codes = (mat * ct_codes).sum(axis=1).astype(np.float32) # positive sign on log2 to return negative distance : sklearn] r = _log2(codes) r[r == 0] = -np.inf # zeros would fool a `shortest code wins` strategy return r def generate_candidates(self, stack=set()): """ Generate candidates from the current codetable (SLIM is any-time) Note that `stack` is updated during the execution of this method. Parameters ---------- stack: set[frozenset], default=None a stack of already-seen candidates to be excluded Returns ------- iterator[tuple(frozenset, Bitmap)] """ ct = SortedDict(self._standard_candidate_order, self.codetable_.items()) return generate_candidates(ct, stack=stack) def evaluate(self, candidate): """ Evaluate ``candidate``, considering the current codetable and a dataset ``D`` Parameters ---------- candidate: frozenset a new candidate to be evaluated Returns ------- (float, float, dict) updated (data size, model size, codetable) """ idx = self.codetable_.bisect(candidate) ct = list(self.codetable_) ct.insert(idx, candidate) D = {k: v.copy() for k, v in self.standard_codetable_.items()} CTc = cover(D, ct) decreased = set() for iset, usage in self.codetable_.items( ): # TODO useless is size is too big if len(CTc[iset]) < len(usage): decreased.add(iset) data_size, model_size = self._compute_sizes(CTc) if self.pruning: CTc, data_size, model_size = self._prune(CTc, decreased, model_size, data_size) return data_size, model_size, CTc def update(self, candidate=None, model_size=None, data_size=None, usages=None): """ Update the current codetable. If `candidate` is passed as None, `model_size`, `data_size` and `usages` will be used If `candidate` is not None, `model_size`, `data_size` and `usages` will be computed by calling `.evaluate` Parameters ---------- candidate: frozenset, default=None candidate to be inserted model_size: float, default=None new model size (in bits) to be set data_size: float new data size (in bits) to be set usages: dict, default=None optional for usage outside of this class eg. 
if one simply needs to include an itemset in the current codetable as in interactive data mining Raises ------ AssertionError """ assert not (candidate is None and usages is None) if usages is None: data_size, model_size, usages = self.evaluate(candidate) to_drop = { c for c in self.codetable_.keys() - usages.keys() if len(c) > 1 } self.codetable_.update(usages) for iset in to_drop: del self.codetable_[iset] self.data_size_ = data_size self.model_size_ = model_size def cover(self, D): """ cover unseen data items never seen are dropped out Examples -------- >>> from skmine.itemsets import SLIM >>> D = ["ABC", "AB", "BCD"] >>> s = SLIM().fit(D) >>> s.cover(["BC", "AB"]) (A, B) (B,) (C,) 0 False True True 1 True False False Returns ------- pd.DataFrame """ if hasattr(D, "shape") and len(D.shape) == 2: # tabular D = _check_D(D) D_sct = { k: Bitmap(np.where(D[k])[0]) for k in D.columns if k in self.standard_codetable_ } else: # transactional D_sct = _to_vertical(D) isets = self.discover(singletons=True, usage_tids=False) isets = isets[isets.index.map(set(D_sct).issuperset)] covers = cover(D_sct, isets.index) mat = np.zeros(shape=(len(D), len(covers)), dtype=bool) for idx, tids in enumerate(covers.values()): mat[tids, idx] = True return pd.DataFrame(mat, columns=list(covers.keys())) def discover(self, singletons=False, usage_tids=False, drop_null_usage=True): """Get a user-friendly copy of the codetable Parameters ---------- singletons: bool, default=False Either to include itemsets of length 1 in the result usage_tids: bool, default=False Either to return transaction ids for an itemset (usage) or its codelength drop_null_usage: bool, default=True Either to include itemset with no usage in the training data (i.e itemsets under cover of other itemsets) Example ------- >>> from skmine.itemsets import SLIM >>> D = ["ABC", "AB", "BCD"] >>> SLIM().fit(D).discover(singletons=True, usage_tids=True, drop_null_usage=False) (A, B) [0, 1] (B,) [2] (A,) [] (C,) [0, 2] (D,) [2] dtype: object Returns ------- pd.Series codetable containing patterns and ids of transactions in which they are used """ s = { tuple(sorted(iset)): tids.copy() for iset, tids in self.codetable_.items() if len(tids) >= drop_null_usage and len(iset) > (not singletons) } s = pd.Series(list(s.values()), index=list(s.keys())) if not usage_tids: s = s.map(len).astype(np.uint32) return s def reconstruct(self): """reconstruct the original data from the current `self.codetable_`""" n_transactions = (max( map(Bitmap.max, filter(lambda e: e, self.codetable_.values()))) + 1) D = pd.Series([set()] * n_transactions) for itemset, tids in self.codetable_.items(): D.iloc[list(tids)] = D.iloc[list(tids)].map(itemset.union) return D.map(sorted) @lru_cache(maxsize=1024) def get_support(self, *items): """ Get support from an itemset Note ---- Items in an itemset must be passed as positional arguments Unseen items will throw errors """ a = items[-1] tids = self.standard_codetable_[a] if len(items) > 1: return tids & self.get_support(*items[:-1]) return tids def _standard_cover_order(self, itemset): """ Returns a tuple associated with an itemset, so that many itemsets can be sorted in Standard Cover Order """ return (-len(itemset), -len(self.get_support(*itemset)), tuple(itemset)) def _standard_candidate_order(self, itemset): return (-len(self.get_support(*itemset)), -len(itemset), tuple(itemset)) def prefit(self, D, y=None): """ Parameters ---------- D: iterable of iterables or array-like Transactional dataset, either as an iterable of iterables or encoded 
as tabular binary data Note ---- Works in 3 steps 1. ingest data `D` 2. track bitmaps for the top `self.n_items` frequent items from `D` 3. set `self.data_size_` and `self.model_size_` given the standard codetable """ if hasattr(D, "ndim") and D.ndim == 2: D = _check_D(D) if y is not None: D = supervised_to_unsupervised(D, y) # SKLEARN_COMPAT item_to_tids = {k: Bitmap(np.where(D[k])[0]) for k in D.columns} else: item_to_tids = _to_vertical(D) sct = pd.Series(item_to_tids) usage = sct.map(len).astype(np.uint32) usage = usage.nlargest(self.n_items) sct = sct[usage.index] self.standard_codetable_ = sct ct_it = ((frozenset([e]), tids) for e, tids in sct.items()) self.codetable_ = SortedDict(self._standard_cover_order, ct_it) codes = -_log2(usage / usage.sum()) self._starting_codes = codes # L(code_ST(X)) = L(code_CT(X)), because CT=ST self.model_size_ = 2 * codes.sum() self.data_size_ = (codes * usage).sum() return self def _compute_sizes(self, codetable): """ Compute sizes for both the data and the model .. math:: L(D|CT) .. math:: L(CT|D) Parameters ---------- codetable : Mapping A mapping of itemsets to their usage tids Returns ------- tuple(float, float) (data_size, model_size) """ isets, usages = zip(*((_[0], len(_[1])) for _ in codetable.items() if len(_[1]) > 0)) usages = np.array(usages, dtype=np.uint32) codes = -_log2(usages / usages.sum()) counts = Counter(chain(*isets)) stand_codes_sum = sum(self._starting_codes[item] * ctr for item, ctr in counts.items()) model_size = stand_codes_sum + codes.sum() # L(CTc|D) = L(X|ST) + L(X|CTc) data_size = (codes * usages).sum() return data_size, model_size def _prune(self, codetable, prune_set, model_size, data_size): """Post-prune a codetable, considering itemsets for which usage has decreased Parameters ---------- codetable: SortedDict prune_set: set itemsets in ``codetable`` for which usage has decreased model_size: float current model_size for ``codetable`` data_size: float current data size when encoding ``D`` with ``codetable`` Returns ------- new_codetable, new_data_size, new_model_size: SortedDict, float, float a tuple containing the pruned codetable, and the new model size and data size w.r.t. this new codetable """ prune_set = {k for k in prune_set if len(k) > 1} # remove singletons while prune_set: cand = min(prune_set, key=lambda e: len(codetable[e])) prune_set.discard(cand) ct = list(codetable) ct.remove(cand) D = {k: v.copy() for k, v in self.standard_codetable_.items() } # TODO avoid data copies CTp = cover(D, ct) decreased = { k for k, v in CTp.items() if len(k) > 1 and len(v) < len(codetable[k]) } d_size, m_size = self._compute_sizes(CTp) if d_size + m_size < model_size + data_size: codetable.update(CTp) del codetable[cand] prune_set.update(decreased) data_size, model_size = d_size, m_size return codetable, data_size, model_size
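The tail of prefit() above is the MDL bookkeeping: each singleton's code length under the standard codetable is -log2(usage / total usage), the data size is the usage-weighted sum of those lengths, and the model size is twice their plain sum because, with CT equal to ST, every singleton is encoded once on each side of the table. Below is a minimal standalone sketch of that computation on a toy dataset; the variable names and the toy data are illustrative, not part of skmine.

import numpy as np
import pandas as pd

D = ["ABC", "AB", "BCD"]                       # toy transactional dataset
item_to_tids = {}
for tid, transaction in enumerate(D):
    for item in transaction:
        item_to_tids.setdefault(item, set()).add(tid)

usage = pd.Series({k: len(v) for k, v in item_to_tids.items()}, dtype=np.uint32)
codes = -np.log2(usage / usage.sum())          # L(code_ST(X)) per singleton
model_size = 2 * codes.sum()                   # L(CT|D): each singleton coded under ST and under CT
data_size = (codes * usage).sum()              # L(D|CT): usage-weighted code lengths
print(data_size, model_size)

The same quantities reappear in _compute_sizes above, where the ST-side term becomes the per-item starting codes weighted by how often each item occurs across the retained itemsets.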
class TimeSeries(TictsMagicMixin, TictsOperationMixin, PandasMixin, TictsIOMixin, TictsPlot): """ TimeSeries object. Args: default: The default value of the timeseries. permissive (bool): Whether to allow accessing non-existing values or not. If True, getting a non-existing item returns None. If False, getting a non-existing item raises a KeyError. """ _default_interpolate = "previous" _meta_keys = ('default', 'name', 'permissive') @property def index(self): return self.data.keys() @property def lower_bound(self): """Return the lower bound time index.""" if self.empty: return MINTS return self.index[0] @property def upper_bound(self): """Return the upper bound time index.""" if self.empty: return MAXTS return self.index[-1] @property def _has_default(self): return self.default != NO_DEFAULT @property def _kwargs_special_keys(self): kwargs = {} for attr_name in self._meta_keys: kwargs[attr_name] = getattr(self, attr_name) return kwargs @property def empty(self): """Return whether the TimeSeries is empty or not.""" return len(self) == 0 def __init__(self, data=None, default=NO_DEFAULT, name=DEFAULT_NAME, permissive=True, tz='UTC'): """""" if isinstance(data, self.__class__): for attr in ('data', *self._meta_keys): setattr(self, attr, getattr(data, attr)) # Only set 'default' and 'name' if they differ from the defaults if default != NO_DEFAULT: setattr(self, 'default', default) if name != DEFAULT_NAME: setattr(self, 'name', name) return if hasattr(default, 'lower') and default.lower() == 'no_default': # 'no_default' as a string is used at JSON serialization time self.default = NO_DEFAULT else: self.default = default self.name = name self.permissive = permissive # Overwrite the name if data is an instance of pd.DataFrame or pd.Series if isinstance(data, pd.DataFrame): if len(data.columns) != 1: msg = ("Can't convert a DataFrame with several columns into " "one timeseries: {}.") raise ValueError(msg.format(data.columns)) self.name = data.columns[0] elif isinstance(data, pd.Series): self.name = data.name try: tz = pytz.timezone(tz) except pytz.UnknownTimeZoneError: raise ValueError('{} is not a valid timezone'.format(tz)) # SortedDict.__init__ does not use __setitem__, # hence we have to parse datetime keys ourselves. # SortedDict checks whether its first arg is a callable, # in case you want to provide your own sort key function. self.data = SortedDict(None, _process_args(data, tz)) def __setitem__(self, key, value): if isinstance(key, slice): return self.set_interval(key.start, key.stop, value) if key in self._meta_keys: super().__setitem__(key, value) else: key = timestamp_converter(key, self.tz) self.data[key] = value def __getitem__(self, key): """Get the value of the time series, even in-between measured values, by interpolation.
Args: key (datetime): datetime index interpolate (str): interpolation operator among ["previous", "linear"] """ interpolate = self._default_interpolate if isinstance(key, tuple): if len(key) == 2: key, interpolate = key elif len(key) > 2: raise KeyError if isinstance(key, slice): return self.slice(key.start, key.stop) key = timestamp_converter(key, self.tz) basemsg = "Getting {} but default attribute is not set".format(key) if self.empty: if self._has_default: return self.default else: if self.permissive: return else: raise KeyError( "{} and timeseries is empty".format(basemsg)) if key < self.lower_bound: if self._has_default: return self.default else: if self.permissive: return else: msg = "{}, can't deduce value before the oldest measurement" raise KeyError(msg.format(basemsg)) # If the key is already defined: if key in self.index: return self.data[key] if interpolate.lower() == "previous": fn = self._get_previous elif interpolate.lower() == "linear": fn = self._get_linear_interpolate else: raise ValueError("'{}' interpolation unknown.".format(interpolate)) return fn(key) def _get_previous(self, time): # In this case, bisect_left == bisect_right == bisect # And idx > 0 as we already handled the other cases previous_idx = self.data.bisect(time) - 1 time_idx = self.index[previous_idx] return self.data[time_idx] def _get_linear_interpolate(self, time): # TODO: put it into a 'get_previous_index' method idx = self.data.bisect_left(time) previous_time_idx = self.index[idx - 1] # TODO: check on left bound case # out of right bound case: if idx == len(self): return self.data[previous_time_idx] next_time_idx = self.index[idx] previous_value = self.data[previous_time_idx] next_value = self.data[next_time_idx] coeff = (time - previous_time_idx) / (next_time_idx - previous_time_idx) value = previous_value + coeff * (next_value - previous_value) return value def slice(self, start, end): # noqa A003 """Slice your timeseries for the given interval. Args: start (datetime or str): lower bound end (datetime or str): upper bound Returns: TimeSeries sliced """ start = timestamp_converter(start, self.tz) end = timestamp_converter(end, self.tz) newts = TimeSeries(**self._kwargs_special_keys) for key in self.data.irange(start, end, inclusive=(True, False)): newts[key] = self[key] should_add_left_closure = (start not in newts.index and start >= self.lower_bound) if should_add_left_closure: newts[start] = self[start] # this applies get_previous on self return newts def set_interval(self, start, end, value): """Set a value for an interval of time. Args: start (datetime or str): lower bound end (datetime or str): upper bound value: the value to be set Returns: self Raises: NotImplementedError: when no default is set. """ if not self._has_default: msg = "At the moment, you have to set a default for set_interval" raise NotImplementedError(msg) start = timestamp_converter(start, self.tz) end = timestamp_converter(end, self.tz) keys = self.data.irange(start, end, inclusive=(True, False)) last_value = self[end] for key in list(keys): del self.data[key] self[start] = value self[end] = last_value def compact(self): """Convert this instance to a compact version: consecutive measurements of the same value are discarded. Returns: TimeSeries """ ts = TimeSeries(**self._kwargs_special_keys) for time, value in self.items(): should_set_it = ts.empty or (ts[time] != value) if should_set_it: ts[time] = value return ts def iterintervals(self, end=None): """Iterator that yields (start, end) pairs of intervals.
Args: end (datetime): right bound of the last interval. """ lst_keys = SortedList(self.index) if not end: end = self.upper_bound else: end = timestamp_converter(end, self.tz) if end not in lst_keys: lst_keys.add(end) for i, key in enumerate(lst_keys[:-1]): next_key = lst_keys[i + 1] if next_key > end: # stop there; raising StopIteration inside a generator has been an error since PEP 479 return yield key, next_key def equals(self, other, check_default=True, check_name=True): if not isinstance(other, self.__class__): raise TypeError("Can't compare {} with {}".format( self.__class__.__name__, other.__class__.__name__)) is_equal = self.data == other.data if check_default: is_equal = is_equal and self.default == other.default if check_name: is_equal = is_equal and self.name == other.name return is_equal @property def tz(self): if self.empty: return pytz.UTC return str(self.index[0].tz) def tz_convert(self, tz): try: tz = pytz.timezone(tz) except pytz.UnknownTimeZoneError: raise ValueError('{} is not a valid timezone'.format(tz)) ts = deepcopy(self) for key in list(ts.index): # copy the keys first: the loop body mutates ts.data ts[key.tz_convert(tz)] = ts.data.pop(key) return ts
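The two private lookups above carry the actual interpolation: _get_previous takes the last key at or before the requested time (bisect is bisect_right in sortedcontainers, so subtracting one lands on the preceding key), and _get_linear_interpolate weights the two surrounding values by the elapsed-time ratio. Below is a standalone sketch of both lookups on a plain SortedDict, assuming the bound checks that __getitem__ performs have already been done; the helper names are illustrative.

from datetime import datetime
from sortedcontainers import SortedDict

measurements = SortedDict({
    datetime(2020, 1, 1, 0, 0): 10.0,
    datetime(2020, 1, 1, 1, 0): 20.0,
    datetime(2020, 1, 1, 2, 0): 40.0,
})

def get_previous(data, when):
    # index of the last key <= `when`; keys before the lower bound are assumed ruled out already
    idx = data.bisect_right(when) - 1
    return data[data.keys()[idx]]

def get_linear(data, when):
    idx = data.bisect_left(when)
    if idx == len(data):              # past the last measurement: fall back to the previous value
        return data[data.keys()[-1]]
    prev_t, next_t = data.keys()[idx - 1], data.keys()[idx]
    coeff = (when - prev_t) / (next_t - prev_t)   # timedelta / timedelta -> float
    return data[prev_t] + coeff * (data[next_t] - data[prev_t])

t = datetime(2020, 1, 1, 0, 30)
print(get_previous(measurements, t))  # 10.0
print(get_linear(measurements, t))    # 15.0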
def search(request): global new_dict if 'selection' in request.POST: spid = request.POST.get('selection') request.session['spid'] = spid col = {} for c in new_dict[spid]: f = 0 e = 0 for a in new_dict[spid][c]: f = f + 1 obj = SomeObject() obj.args = {spid: {c: {a}}} ob = SomeObject.objects.filter(args=obj.args).count() e = e + ob col[c] = [f, e] print(col) context = {'col': col} return render(request, "search.html", context) if 'error' in request.POST: links = {} txt = [] px = [] spid = request.session['spid'] c = request.POST.get('error') request.session['c'] = c cx = re.sub(r"\s([?.!',](?:\s|$))", r"\1", c) cx = cx.replace(" '", "'") for a in new_dict[spid][c]: obj = SomeObject() obj.args = {spid: {c: {a}}} ob = SomeObject.objects.filter(args=obj.args).count() fla = 0 links[a] = ob a = a.split("_") URL = "https://en.wikipedia.org/?curid=" + a[0] ind = sd.bisect(int(a[0])) key = sd.iloc[ind] value = str(sd[key], 'utf-8') all_files = os.listdir("extracted/" + value + '/') temp = open("extracted/" + value + "/index.txt", "rb") dic = {} for all in all_files: for line in temp: (key, val1) = line.split() dic[int(val1)] = key sdi = SortedDict((key, value) for key, value in dic.items()) ind = sdi.bisect(int(a[0])) key = sdi.iloc[ind - 1] val = str(sdi[key], 'utf-8') with bz2.open("extracted/" + value + "/" + val, "rt") as bz_file: p = [] for line in bz_file: if fla == 1: if cx in line: p.append(line.replace(cx, "<b>" + cx + "</b>")) if doc in line: n = re.search(" +id=\"(.*?)\"", line) if n.group(1) == a[0]: fla = 1 pp = re.search(" +title=\"(.*?)\"", line) px.append(pp.group(1)) else: fla = 0 #ALTERNATE WAY IF YOU WANT TO GET TEXTS FROM ONLINE LINKS #r = requests.get(URL) #soup = BeautifulSoup(r.content, 'html5lib') #abc = soup.get_text().splitlines() #cx = re.sub(r"\s([?.!',](?:\s|$))", r"\1", c) #cx = cx.replace(" '","'") #px.append(soup.title.string) #p = [line for line in soup.get_text().splitlines() if cx in line] txt.append(p) link = OrderedDict( sorted(links.items(), key=lambda t: t[1], reverse=True)) context = {'links': link, 'cols': cx, 'txt': txt, 'px': px} return render(request, "search.html", context) if 'store' in request.POST: st = request.POST.getlist('recommendations') spid = request.session['spid'] c = request.session['c'] for s in st: for a in new_dict[spid][c]: if a == s: obj = SomeObject() obj.args = {spid: {c: {a}}} obj.user = request.user if SomeObject.objects.filter( args=obj.args, user=obj.user).exists() == False: mission = "Thanks for your feedback" obj.save() else: mission = "Already Exists" context = {'mission': mission} return render(request, "search.html", context) spid = request.session['spid'] co = {} for c in new_dict[spid]: f = 0 e = 0 for a in new_dict[spid][c]: f = f + 1 obj = SomeObject() obj.args = {spid: {c: {a}}} ob = SomeObject.objects.filter(args=obj.args).count() e = e + ob co[c] = [f, e] if 'alphabet' in request.POST: col = OrderedDict(sorted(co.items(), key=lambda t: t[0])) if 'frequency' in request.POST: col = OrderedDict( sorted(co.items(), key=lambda t: t[1][0], reverse=True)) if 'marked' in request.POST: col = OrderedDict( sorted(co.items(), key=lambda t: t[1][1], reverse=True)) context = {'col': col} return render(request, "search.html", context)
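The view above uses SortedDict twice as an index into the extracted Wikipedia dump: sd maps a page id to the extraction directory and sdi maps it to the right bz2 chunk inside that directory, by bisecting on the first page id of each bucket and stepping back one position. Here is a stripped-down sketch of that lookup pattern with made-up ids and file names, using peekitem for the positional access:

from sortedcontainers import SortedDict

# first page id stored in each extracted file -> file name (made-up values)
first_id_to_file = SortedDict({1: "wiki_00.bz2", 5000: "wiki_01.bz2", 12000: "wiki_02.bz2"})

def file_for_page(page_id):
    # bisect_right gives the insertion point, so the bucket that starts at or
    # before page_id sits one position to the left (ids below the first key are not expected here)
    idx = first_id_to_file.bisect_right(page_id) - 1
    return first_id_to_file.peekitem(idx)[1]

print(file_for_page(7340))   # wiki_01.bz2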
# print(date) gpsinfo = exif['GPSInfo'] print(gpsinfo) (lat, lng) = gpsTuplesToFloat(gpsinfo) # print(lng) datetimeLatLng[date] = (lat,lng) # Get the list of THETA photo files theta_filenames = glob('R*.jpg') # Write into each THETA photo the lat/lng interpolated in time from the nearest iPhone photos' coordinates for filename in theta_filenames: exif2 = get_exif_of_image(filename) dto = dt.strptime(exif2['DateTimeOriginal'], '%Y:%m:%d %H:%M:%S') bisectDto = datetimeLatLng.bisect(dto) #print(bisectDto) # print(type(dto)) # print(type(keys[0])) item1 = datetimeLatLng.items()[max(bisectDto - 1, 0)] item2 = datetimeLatLng.items()[min(bisectDto, len(datetimeLatLng) - 1)] print(item1) print(item2) # print(dto) k = (dto - item1[0])/(item2[0] - item1[0]) lat = item1[1][0] + k * (item2[1][0] - item1[1][0]) lng = item1[1][1] + k * (item2[1][1] - item1[1][1]) print((dto, (lat,lng))) exif_dict2 = piexif.load(filename) exif_dict2['GPS'] = floatLatLngToGpsTuple((lat, lng))
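The snippet above interpolates each THETA photo's position in time between the two nearest iPhone fixes, clamping the neighbour indices so timestamps before the first or after the last fix stay inside the SortedDict. A compact sketch of the same idea with made-up fixes follows; the t1 == t2 guard, which the snippet above does not have, avoids a zero division when both clamped neighbours are the same entry.

from datetime import datetime
from sortedcontainers import SortedDict

fixes = SortedDict({
    datetime(2021, 5, 1, 10, 0): (35.6580, 139.7016),
    datetime(2021, 5, 1, 10, 10): (35.6600, 139.7040),
})

def interpolate(when):
    pos = fixes.bisect(when)
    t1, (lat1, lng1) = fixes.peekitem(max(pos - 1, 0))
    t2, (lat2, lng2) = fixes.peekitem(min(pos, len(fixes) - 1))
    if t1 == t2:                       # before the first or after the last fix
        return lat1, lng1
    k = (when - t1) / (t2 - t1)        # time-based interpolation weight
    return lat1 + k * (lat2 - lat1), lng1 + k * (lng2 - lng1)

print(interpolate(datetime(2021, 5, 1, 10, 5)))   # halfway between the two fixes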
for subdir, dirs, files in os.walk(photosDirectory): for file in files: if isPhoto(file): try: exif = getExif(os.path.join(subdir, file)) if not cameraIsValid(exif): continue # get the focal length and convert it from the rational data type to float focalLength = exif[FOCALLENGTH_TAG][0] / exif[FOCALLENGTH_TAG][1] # count every focal length occurrence in the dictionary if (focalLength in occurences): occurences[focalLength] = occurences[focalLength] + 1 else: # find the nearest pre-defined focal length index = occurences.bisect(focalLength) greater = occurences.iloc[index] smaller = occurences.iloc[index - 1] nearestFL = greater if (greater - focalLength < focalLength - smaller) else smaller occurences[nearestFL] = occurences[nearestFL] + 1 except (KeyError, TypeError, IndexError): # no focal length info in the image EXIF data, or a focal length above the largest pre-defined value (Key/Type/IndexError) pass # plot the graph position = arange(len(focalLengths)) + .5 barh(position, occurences.values(), align='center', color='#FF0000') yticks(position, occurences.keys()) xlabel('Occurrences') ylabel('Focal length')
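The else branch above snaps an unusual focal length to the nearest pre-seeded key with a single bisect; values above the largest key are only skipped because the surrounding try/except swallows the resulting IndexError, and values below the smallest key wrap around to iloc[index - 1] with index == 0. Below is a small sketch of the same snap with both boundary cases handled explicitly; the seed list of focal lengths is an assumption mirroring focalLengths above.

from sortedcontainers import SortedDict

focalLengths = [24, 28, 35, 50, 85, 135]                 # assumed seed values (mm)
occurences = SortedDict((fl, 0) for fl in focalLengths)  # spelling kept from the snippet above

def nearest_focal_length(value):
    idx = occurences.bisect(value)
    if idx == 0:                      # below the smallest pre-defined focal length
        return occurences.keys()[0]
    if idx == len(occurences):        # above the largest pre-defined focal length
        return occurences.keys()[-1]
    smaller, greater = occurences.keys()[idx - 1], occurences.keys()[idx]
    return greater if greater - value < value - smaller else smaller

occurences[nearest_focal_length(33.0)] += 1   # counted under 35 mm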