class SharedStringTable(AbstractSharedStringTable):
    """
    A class to track Excel shared strings between worksheets.

    Strings are stored in an ordered bidirectional mapping of
    ``string -> index`` so that both the index of a string and the string
    at an index can be looked up directly.
    """

    def __init__(self):
        # Total number of get_index() calls, i.e. references to shared
        # strings including repeats.
        self._count = 0

        # Imported at construction time rather than at module import time.
        from bidict import OrderedBidict

        # Maps each unique string to the index it was assigned on insertion.
        self._strings = OrderedBidict()

    @property
    def supports_constant_memory(self):
        """Always False for this table implementation."""
        return False

    @property
    def unique_count(self):
        """Number of distinct strings currently stored in the table."""
        return len(self._strings)

    @property
    def count(self):
        """Total number of string references requested via get_index()."""
        return self._count

    def get_index(self, string):
        """
        Get the index of the string in the Shared String table.

        A string that is not yet stored is added and assigned the next free
        index (the current number of unique strings). Every call, whether
        the string is new or not, increments the reference count.
        """
        if string in self._strings:
            # String exists in the table.
            index = self._strings[string]
        else:
            # String isn't already stored in the table so add it.
            index = self.unique_count
            self._strings[string] = index

        self._count += 1
        return index

    def get_string(self, index):
        """
        Get a shared string from the index.

        Raises KeyError if no string has been assigned that index.
        """
        return self._strings.inverse[index]

    def get_strings(self):
        """
        Return an iterator over the stored strings.

        Strings are yielded in insertion order, which is also ascending
        index order because indices are assigned sequentially.
        """
        # iter() works on every Python version; the former iterkeys() call
        # only exists on Python 2 mappings.
        return iter(self._strings)
# These test cases ensure coverage of all branches in [Ordered]BidictBase._undo_write # (Hypothesis doesn't always generate examples that cover all the branches otherwise). @example(bidict({ 1: 1, 2: 2 }), [(1, 3), (1, 2)], OnDup(key=DROP_OLD, val=RAISE)) @example(bidict({ 1: 1, 2: 2 }), [(3, 1), (2, 4)], OnDup(key=RAISE, val=DROP_OLD)) @example(bidict({ 1: 1, 2: 2 }), [(1, 2), (1, 1)], OnDup(key=RAISE, val=RAISE, kv=DROP_OLD)) @example(OrderedBidict({ 1: 1, 2: 2 }), [(1, 3), (1, 2)], OnDup(key=DROP_OLD, val=RAISE)) @example(OrderedBidict({ 1: 1, 2: 2 }), [(3, 1), (2, 4)], OnDup(key=RAISE, val=DROP_OLD)) @example(OrderedBidict({ 1: 1, 2: 2 }), [(1, 2), (1, 1)], OnDup(key=RAISE, val=RAISE, kv=DROP_OLD)) def test_putall_same_as_put_for_each_item(bi, items, on_dup): """*bi.putall(items) <==> for i in items: bi.put(i)* for all values of OnDup.""" check = bi.copy() expect = bi.copy() checkexc = None expectexc = None
def __init__(self):
    """Create an empty table: no strings stored, no references counted."""
    # Imported at construction time rather than at module import time.
    from bidict import OrderedBidict

    # Ordered two-way mapping between each unique string and its index.
    self._strings = OrderedBidict()
    # Running total of string references handed out so far.
    self._count = 0
class NominalDataEncoder(object):
    """A single class for handling the encoding of nominal data to integer
    values or one hot / binarized vectors.

    nominal_value -> int -> one_hot_vector or binary vector

    Consider extending bidict.

    Attributes
    ----------
    encoder : OrderedBidict
        The bidirectional mapping of nominal value to integer encoding.
        There can be no multiple keys that map to the same value.
    argsorted_keys : np.ndarray(int)
        When the keys in the encoder are not sorted, but instead saved in the
        order they are given, then argsorted_keys is an array of the indices
        of the encoder keys in sorted order. This is necessary for encoding
        using numpy only when the keys are not sorted when saved into the
        encoder. If the keys are sorted when the encoder is created, then
        this is None.
    pos_label : int
        The positive label used when binarizing or one hot encoding.
    neg_label : int
        The negative label used when binarizing or one hot encoding.
    sparse_output : bool
        Passed through to `label_binarize` when one hot encoding.

    Notes
    -----
    This is to provide ease-of-use for handling nominal data encodings where
    sklearn's label encoders were not as ergonomic. Here, "ergonomic" is
    about grouping the necessary parts together for handling labels so it is
    all in one place. This class could be implemented to wrap the sklearn's
    encoders, but Bidict was used instead along with sklearn functions.
    """
    # TODO perhaps inherit from sklearn's LabelEncoder, given using its code.
    # The idea was to extend OrderedBidict to do efficient numpy
    # transformations similar to how sklearn's LabelEncoder does, and to also
    # organize the locality of labels, their encodings, and functions for
    # transforming data to and from the label encodings.
    #
    # Furthermore, this is to aid in working with labels in general, esp. in
    # the case of complex label relationships and updating and changing
    # labels at certain levels of class hierarchy. So TODO: add ease of
    # combining NominalDataEncoders together, and this is where shift then
    # would come into play.

    def __init__(
        self,
        ordered_keys,
        shift=0,
        pos_label=1,
        neg_label=0,
        sparse_output=False,
        ignore_dups=False,
        sort_keys=False,
    ):
        """
        Parameters
        ----------
        ordered_keys : Iterable
            The keys to be added to the encoder. Must support `len()` and
            contain hashable elements.
        shift : int, optional
            Shifts the encoding by the given value. Can be seen as the
            starting value of the ordered encodings.
        pos_label : int
            The positive label to use when binarizing or one hot encoding.
        neg_label : int
            The negative label to use when binarizing or one hot encoding.
        sparse_output : bool
            Same as scikit-learn's LabelBinarizer at the moment.
        ignore_dups : bool, optional
            Ignores any duplicates in the given ordered keys. Not
            implemented!
        sort_keys : bool, optional
            If True, the keys are stored in sorted order rather than in the
            order given.

        Raises
        ------
        NotImplementedError
            If `ignore_dups` is True.
        ValueError
            If `ordered_keys` contains duplicates.
        """
        if ignore_dups:
            raise NotImplementedError('Ignore_dups is not yet implemented.')
        if len(set(ordered_keys)) != len(ordered_keys):
            raise ValueError('There are duplicates in the given sequence')

        if sort_keys:
            # Sort the keys so they are in the encoder sorted, rather than
            # order given. np.unique sorts the keys.
            ordered_keys = np.unique(ordered_keys)
            self.argsorted_keys = None

            # KeySortedBidict keeps the keys sorted.
            # enumerate() already starts at `shift`; adding `shift` again
            # would make the encodings start at 2 * shift.
            self.encoder = KeySortedBidict({
                key: enc for enc, key in enumerate(ordered_keys, shift)
            })
        else:
            # Use in order given, but maintain argsorted_keys for encoding.
            # NOTE np.unique's return_index only yields the argsort of the
            # keys because the keys are guaranteed unique by the check above.
            # TODO probably can make ignore dups and error raise more
            # efficient if already using unique like this.
            _, self.argsorted_keys = np.unique(
                ordered_keys,
                return_index=True,
            )
            self.encoder = OrderedBidict({
                key: enc for enc, key in enumerate(ordered_keys, shift)
            })

        self.pos_label = pos_label
        self.neg_label = neg_label
        self.sparse_output = sparse_output

        # TODO need to flesh out the handling of unknowns in the encoder:
        # an `unknown` option of None, "update", or "single" together with
        # `unknown_idx` / `unknown_key`, plus the matching support in
        # encode() and decode().

    @property
    def keys_sorted(self):
        """True when the encoder is a KeySortedBidict."""
        return isinstance(self.encoder, KeySortedBidict)

    def keys(self, *args, **kwargs):
        """Forward to `self.encoder.keys`."""
        return self.encoder.keys(*args, **kwargs)

    def values(self, *args, **kwargs):
        """Forward to `self.encoder.values`."""
        return self.encoder.values(*args, **kwargs)

    def items(self, *args, **kwargs):
        """Forward to `self.encoder.items`."""
        return self.encoder.items(*args, **kwargs)

    def encode(self, keys, one_hot=False):
        """Encodes the given values into their respective encodings.

        Parameters
        ----------
        keys : scalar or np.ndarray
        one_hot : bool
            If True, then expects to encode the keys into their respective
            one hot vectors. Otherwise, expects to map elements to their
            respective encoding values.

        Returns
        -------
        scalar or np.ndarray
            Same shape as input keys, but with elements changed to the proper
            encoding.

        Raises
        ------
        ValueError
            If `keys` contains a key that is not in the encoder.
        """
        if one_hot:
            return label_binarize(
                keys,
                classes=np.array(self.encoder),
                pos_label=self.pos_label,
                neg_label=self.neg_label,
                sparse_output=self.sparse_output,
            )

        keys = validation.column_or_1d(keys, warn=True)

        if validation._num_samples(keys) == 0:
            return np.array([])

        # Check for unrecognized keys
        # TODO may be able to be more efficient?
        diff = set(np.unique(keys)) - set(self.encoder)
        if diff:
            raise ValueError(f'`keys` contains previously unseen keys: {diff}')
        # TODO allow for assigning a default encoding value if unknown
        # label, i.e. not in the current encoder, XOR allow for updating of
        # the labels in order of occurrence. Default is as is: fail if there
        # is an unseen label.

        if keys.dtype == object:
            # Python encode
            return np.array([self.encoder[key] for key in keys])

        # Numpy encode
        # NOTE(review): both numpy paths return the position of each key in
        # the encoder, which presumably only equals the stored encoding when
        # the first encoding is 0 -- confirm how shift should apply here.
        if self.keys_sorted:
            # Encoder keys are already sorted within the encoder.
            return np.searchsorted(self.encoder, keys)

        # TODO to get this to work w/ np.searchsorted as sklearn does it, a
        # sorted args of the keys must always be present. This means as the
        # keys change, this sorted args must also change. This is the cost
        # of having any order of keys.
        return self.argsorted_keys[np.searchsorted(
            self.encoder,
            keys,
            sorter=self.argsorted_keys,
        )]

    def decode(self, encodings, one_hot_axis=None):
        """Decodes the given encodings into their respective keys.

        Parameters
        ----------
        encodings : scalar or np.ndarray
        one_hot_axis : int, optional
            If an int, `encodings` is treated as one hot vectors and reduced
            with `argmax` along this axis before decoding. Otherwise the
            elements are mapped directly to their respective keys.

        Returns
        -------
        scalar or np.ndarray
            Same shape as input encodings, but with elements changed to the
            proper key.

        Raises
        ------
        ValueError
            If `encodings` contains a value outside of
            `range(len(self.keys()))`.
        """
        if isinstance(one_hot_axis, int):
            encodings = encodings.argmax(axis=one_hot_axis)
            # TODO check encodings.shape to expected shape

        encodings = validation.column_or_1d(encodings, warn=True)
        # inverse transform of empty array is empty array
        if validation._num_samples(encodings) == 0:
            return np.array([])

        diff = np.setdiff1d(encodings, np.arange(len(self.keys())))
        if len(diff):
            raise ValueError(
                "encodings contains previously unseen labels: %s" % str(diff))

        # TODO hard to handle unknowns in the decoding case, but could do
        # update or default as well, I suppose.

        return np.array(self.encoder)[np.array(encodings)]

    def shift_encoding(self, shift):
        """Increments or decrements all encodings by the given integer.

        Parameters
        ----------
        shift : int
            shifts all encodings by this constant integer.

        Raises
        ------
        TypeError
            If `shift` is not an int.
        """
        if not isinstance(shift, int):
            raise TypeError(
                f'Expected `shift` to be type `int`, not `{type(shift)}`'
            )

        # NOTE uncertain when shift comes into play outside of maintenance or
        # when an enc value that is off from that of array indices applies.

        if shift == 0:
            logging.debug('Shift value given was zero. No shifting done.')
            return

        # Rebuild the mapping rather than updating it in place: assigning
        # `enc + shift` key by key collides with the not-yet-shifted value of
        # a later key, which a bidict rejects as a duplicate value.
        self.encoder = type(self.encoder)({
            key: enc + shift for key, enc in self.encoder.items()
        })

    def append(self, keys, ignore_dups=False):
        """Appends the keys to the end of the encoder giving them their
        respective encodings.

        Parameters
        ----------
        keys : list, tuple, np.ndarray, or a single key
            A list, tuple, or array is treated as a sequence of keys to add
            in order; anything else is added as one key.
        ignore_dups : bool, optional
            If True, keys already in the encoder are silently skipped.

        Raises
        ------
        KeyError
            If a key is already in the encoder and `ignore_dups` is False.
            Keys preceding it in `keys` remain added.
        """
        # TODO handle the update to argsorted_keys, more efficiently
        last_enc = next(reversed(self.encoder.inverse))

        if isinstance(keys, (list, tuple, np.ndarray)):
            # Add the multiple keys to the encoder in order.
            added = False
            try:
                for key in keys:
                    if key not in self.encoder:
                        last_enc += 1
                        self.encoder[key] = last_enc
                        added = True
                    elif ignore_dups:
                        continue
                    else:
                        raise KeyError(
                            f'Given key `{key}` is already in the NominalDecoder!',
                        )
            finally:
                # Refresh argsorted_keys once for the whole batch, and also
                # when a duplicate aborts the loop after some keys were added.
                if added and not self.keys_sorted:
                    self.argsorted_keys = np.argsort(self.encoder)
        else:
            # Add individual key
            if keys not in self.encoder:
                self.encoder[keys] = last_enc + 1

                if not self.keys_sorted:
                    # Must update the argsorted_keys for appropriate encoding
                    # TODO replace this hotfix cuz this is inefficient!
                    self.argsorted_keys = np.argsort(self.encoder)
            elif ignore_dups:
                return
            else:
                raise KeyError(
                    f'Given key `{keys}` is already in the NominalDecoder!',
                )

    def reorder(self, keys):
        """Reorder the keys. Not implemented."""
        raise NotImplementedError()

        # TODO reorder by new sequence of keys (equivalent to making a new
        # NDEnc but preserving the shift, if there is any, which now may be a
        # deprecated thing anyways, so reorder would be superfluous in this
        # case). Partial reorder, as in swapping class locations, may still
        # be useful.

    def pop(self, key, encoding=False):
        """Pops the single key and updates the encoding as necessary.

        Parameters
        ----------
        key : object
            The key to remove, or its encoding when `encoding` is True.
        encoding : bool, optional
            If True, `key` is interpreted as an encoding value.

        Returns
        -------
        object
            The removed key when `encoding` is True, otherwise the encoding
            the removed key had.
        """
        # NOTE pop key, but then requires updating the rest of the following
        # keys, while if this was done by a list, it would be handled by
        # shifting the array and index mapping done automatically.

        # Handle the shift in encoding if there is any: the first encoding
        # in the encoder is taken as the shift.
        shift = next(iter(self.encoder.inverse))

        # Obtain the last encoding
        last_enc = next(reversed(self.encoder.inverse))

        if not self.keys_sorted:
            # Position of the key to remove within the encoder's key order.
            target = self.encoder.inverse[key] if encoding else key
            arg = np.argwhere(np.array(self.encoder) == target)[0][0]

            # argsorted_keys holds key positions in sorted order, so drop the
            # entry whose *value* is `arg` (not the entry at index `arg`).
            remaining = self.argsorted_keys[self.argsorted_keys != arg]

            # Positions after the removed key all move down by one.
            remaining[remaining > arg] -= 1
            self.argsorted_keys = remaining

        # Remove the given key, whether it is a key or encoding
        if encoding:
            enc = key
            key = self.encoder.inverse.pop(key)
        else:
            enc = self.encoder.pop(key)

        if enc != last_enc:
            # Decrement all following keys by one. Ascending order is safe
            # because each key moves into the value just freed before it.
            for k in list(self.encoder)[enc - shift:]:
                self.encoder[k] -= 1

        return key if encoding else enc

    # TODO efficiently handle the popping of a sequence of keys and the
    # updating of the encoding.

    # TODO consider an insert_after(), inplace of a append() then reorder()

    def save(self, filepath, sep=None):
        """Saves the labels as an ordered list where the index is implied by
        the order of the labels.

        Parameters
        ----------
        filepath : str
            Path of the text file to write, one label per line.
        sep : None
            Only None (newline separated) is supported.

        Raises
        ------
        NotImplementedError
            If `sep` is not None.
        """
        if sep is None:
            with open(filepath, 'w') as openf:
                openf.write('\n'.join([str(x) for x in self.encoder]))
        else:
            raise NotImplementedError(' '.join([
                'Saving as any file using separators other than newlines',
                'between the labels is not yet supported.',
            ]))

    @staticmethod
    def load(filepath, sep=None, *args, **kwargs):
        """Loads the ordered list from the file. Defaults to expect a text
        file where each line contains a single nominal label.

        All arguments are forwarded to `load_label_set`.
        """
        return load_label_set(filepath, sep, *args, **kwargs)
def __init__( self, ordered_keys, shift=0, pos_label=1, neg_label=0, sparse_output=False, ignore_dups=False, sort_keys=False, #unknown=None, #unknown_idx=None, #unknown_key='unknown_default', ): """ Parameters ---------- ordered_keys : Iterable The keys to be added to the shift : int, optional Shifts the encoding by the given value. Can be seen as the starting value of the ordered encodings. pos_label : int The positive label to use when binarizing or one hot encoding. neg_label : in The negative label to use when binarizing or one hot encoding. sparse_output : bool ??? same as scikit LabelBinarizer learn atm. ignore_dups : bool, optional Ignores any duplicates in the given ordered keys. Not implemented! sort_keys : bool, optional """ if not ignore_dups and len(set(ordered_keys)) != len(ordered_keys): raise ValueError('There are duplicates in the given sequence') if ignore_dups: raise NotImplementedError('Ignore_dups is not yet implemented.') if sort_keys: # Sort the keys so they are in the encoder sorted, rather than # order given. ordered_keys = np.unique(ordered_keys) # np.unique sorts the keys self.argsorted_keys = None # KeySortedBidict keeps the keys sorted self.encoder = KeySortedBidict({ key: enc + shift for enc, key in enumerate(ordered_keys, shift) }) else: # Use in order given, but maintain sorted_args for encoding unique, self.argsorted_keys = np.unique( ordered_keys, return_index=True, ) # TODO probably can make ignore dups and error raise more efficient # if already using unique like this. NOTE that unique here does not # work to get argsorted_keys unless ordered_keys is already unique # keys only, which is it given the check and NOT ignore_dups self.encoder = OrderedBidict({ key: enc + shift for enc, key in enumerate(ordered_keys, shift) }) self.pos_label = pos_label self.neg_label = neg_label self.sparse_output = sparse_output # TODO need to flesh out the handling of unknowns in the enecoder """
default = str(DATA_DIR / "config/colors.ini") conf_locations = [str(DIR / "colors.ini") for DIR in CONF_DIRS[::-1]] if CONF_DIR not in CONF_DIRS: conf_locations.append(str(CONF_DIR / "colors.ini")) COLORS.read([default, *conf_locations]) def save_colors(): with open(str(CONF_DIR / "colors.ini"), "w") as color_file: COLORS.write(color_file) COLORS.load = load_colors COLORS.save = save_colors LOG_CFG = {} TITLES = { "spectrum_view": OrderedBidict({ "name": "Name", "notes": "Notes" }), "peak_view": OrderedBidict({ "label": "Label", "name": " ", "shape": "Shape", "position": "Position", "area": "Area*", "fwhm": "FWHM*", "alpha": "Par1", "beta": "Par2", "gamma": "Par3" }), "static_specinfo": OrderedBidict({ "filename": "Filename" }),
import pytest from bidict import OrderedBidict, ValueDuplicationError, bidict BIDICT_TYPES = (bidict, OrderedBidict) ELEMENTS = OrderedBidict(( ('H', 'hydrogen'), ('He', 'helium'), ('Li', 'lithium'), ('Be', 'beryllium'), ('B', 'boron'), ('C', 'carbon'), ('N', 'nitrogen'), ('O', 'oxygen'), ('F', 'fluorine'), ('Ne', 'neon'), ('Na', 'sodium'), ('Mg', 'magnesium'), ('Al', 'aluminum'), ('Si', 'silicon'), ('P', 'phosphorus'), ('S', 'sulfur'), ('Cl', 'chlorine'), ('Ar', 'argon'), )) UPDATE_NODUP = OrderedBidict(( ('K', 'potassium'), ('Ca', 'calcium'), ('Sc', 'Scandium'), ('Ti', 'titanium'),
class _DictSubcls(dict): pass class _OrderedBidictSubcls(OrderedBidict): pass # pylint: disable=C0103 items = [('a', 1), ('b', 2)] # use int values so makes sense with Counter itemsreversed = list(reversed(items)) bidict_of_items = bidict(items) frozenbidict_of_items = frozenbidict(items) namedbidict_of_items = namedbidict('named', 'keys', 'vals')(items) orderedbidict_of_items = OrderedBidict(items) orderedbidict_of_itemsreversed = OrderedBidict(itemsreversed) orderedbidictsubcls_of_items = _OrderedBidictSubcls(items) orderedbidictsubcls_of_itemsreversed = _OrderedBidictSubcls(itemsreversed) frozenorderedbidict_of_items = FrozenOrderedBidict(items) frozenorderedbidict_of_itemsreversed = FrozenOrderedBidict(itemsreversed) bidicts = ( bidict_of_items, frozenbidict_of_items, namedbidict_of_items, orderedbidict_of_items, orderedbidict_of_itemsreversed, orderedbidictsubcls_of_items, orderedbidictsubcls_of_itemsreversed, frozenorderedbidict_of_items, frozenorderedbidict_of_itemsreversed,
assert mapping_inv == bi.inv assert not bi.inv != mapping_inv assert not mapping_inv != bi.inv @given(st.HBI_AND_HMAP_FROM_SAME_ND_ITEMS) def test_equal_hashables_have_same_hash(hashable_bidict_and_mapping): """Hashable bidicts and hashable mappings that are equal should hash to the same value.""" bi, mapping = hashable_bidict_and_mapping assert bi == mapping assert mapping == bi assert hash(bi) == hash(mapping) @given(st.BIDICTS, st.NON_BI_MAPPINGS) @example(OrderedBidict([(1, 1), (2, 2)]), OrderedDict([(1, 1), (2, 2)])) @example(OrderedBidict([(1, 1), (2, 2)]), OrderedDict([(2, 2), (1, 1)])) @example(OrderedBidict({None: None}), {False: None, None: None}) def test_equals_matches_equals_order_sensitive(bi, mapping): """Bidict equals_order_sensitive should agree with __eq__.""" mapping_inv = OrderedDict((v, k) for (k, v) in mapping.items()) if bi.equals_order_sensitive(mapping): assert bi == mapping assert mapping == bi assert list(bi.inv.items()) == list(mapping_inv.items()) else: assert list(bi.items()) != list(mapping.items()) if bi == mapping: assert mapping == bi assert bi.items() == mapping.items( ) # should use (unordered) set comparison