def test_index(self):
    """Check that a non-callable key is looked up through ``metadata``.

    ``MetadataHaver`` is a ``dict`` whose ``metadata`` property returns the
    dict itself, so ``resolve_key(obj, k)`` is expected to yield ``obj[k]``.
    """
    class MetadataHaver(dict):
        # Only the property is defined: a plain ``metadata = {}`` class
        # attribute placed before it would be rebound by the property and
        # never be observable.
        @property
        def metadata(self):
            return self

    obj = MetadataHaver({'foo': 123})
    self.assertEqual(resolve_key(obj, 'foo'), 123)

    obj = MetadataHaver({'foo': 123, 'bar': 'baz'})
    self.assertEqual(resolve_key(obj, 'bar'), 'baz')
def from_iterable(cls, iterable, metric, key=None, keys=None):
    """Build a DistanceMatrix by applying `metric` to every pair of items.

    Parameters
    ----------
    iterable : iterable
        Objects between which pairwise distances are computed.
    metric : callable
        Two-argument function returning the distance (a float) between
        its arguments.
    key : callable or metadata key, optional
        Either a one-argument function producing the id of an element, or
        a key into each element's `metadata` property. When None, default
        ids are used.
    keys : iterable, optional
        Ids to use, one per element of `iterable`, in the same order.

    Returns
    -------
    DistanceMatrix
        Result of applying `metric` to all pairs of elements.

    Raises
    ------
    ValueError
        If `key` and `keys` are both provided.

    Notes
    -----
    `metric` is assumed to be symmetric and hollow: it is evaluated only
    for the strictly lower triangle and each value is mirrored into the
    upper triangle, while the diagonal is left at zero.

    """
    items = list(iterable)

    if not (key is None or keys is None):
        raise ValueError("Cannot use both `key` and `keys` at the same"
                         " time.")

    # With no `key`, the ids are whatever `keys` holds (possibly None).
    if key is None:
        ids = keys
    else:
        ids = [resolve_key(item, key) for item in items]

    size = len(items)
    matrix = np.zeros((size, size))
    for row in range(size):
        for col in range(row):
            # One metric call fills both mirrored cells.
            matrix[row, col] = matrix[col, row] = metric(items[row],
                                                         items[col])

    return cls(matrix, ids)
def from_iterable(cls, iterable, metric, key=None, keys=None):
    """Construct a DistanceMatrix from pairwise `metric` evaluations.

    Parameters
    ----------
    iterable : iterable
        Collection of objects to compare pairwise.
    metric : callable
        Function of two arguments that returns their distance as a float.
    key : callable or metadata key, optional
        One-argument function giving an element's id, or alternatively a
        key into the `metadata` property of each element. Default ids are
        used when this is None.
    keys : iterable, optional
        Explicit ids; the n-th entry labels the n-th element of
        `iterable`.

    Returns
    -------
    DistanceMatrix
        The pairwise distances produced by `metric`.

    Raises
    ------
    ValueError
        If `key` and `keys` are both provided.

    Notes
    -----
    Symmetry and hollowness of `metric` are taken for granted. Only pairs
    below the diagonal are passed to `metric`; the upper triangle is
    filled by mirroring and the diagonal stays zero.

    """
    elements = list(iterable)

    both_given = key is not None and keys is not None
    if both_given:
        raise ValueError("Cannot use both `key` and `keys` at the same"
                         " time.")

    labels = None
    if key is not None:
        labels = [resolve_key(element, key) for element in elements]
    elif keys is not None:
        labels = keys

    count = len(elements)
    distances = np.zeros((count, count))
    # Row 0 has nothing below the diagonal, so start at row 1.
    for i in range(1, count):
        current = elements[i]
        for j in range(i):
            distances[i, j] = distances[j, i] = metric(current, elements[j])

    return cls(distances, labels)
def from_iterable(cls, iterable, metric, key=None, keys=None):
    """Build a DissimilarityMatrix by applying `metric` to all item pairs.

    Parameters
    ----------
    iterable : iterable
        Objects between which pairwise dissimilarities are computed.
    metric : callable
        Two-argument function returning the dissimilarity (a float)
        between its arguments.
    key : callable or metadata key, optional
        Either a one-argument function producing the id of an element, or
        a key into each element's `metadata` property. When None, default
        ids are used.
    keys : iterable, optional
        Ids to use, one per element of `iterable`, in the same order.

    Returns
    -------
    DissimilarityMatrix
        Result of applying `metric` to all pairs of elements.

    Raises
    ------
    ValueError
        If `key` and `keys` are both provided.

    Notes
    -----
    Every ordered pair, including each element with itself, is passed to
    `metric`; nothing is mirrored or assumed to be zero.

    """
    items = list(iterable)

    if not (key is None or keys is None):
        raise ValueError("Cannot use both `key` and `keys` at the same"
                         " time.")

    if key is None:
        ids = keys
    else:
        ids = [resolve_key(item, key) for item in items]

    size = len(items)
    # Uninitialised storage is fine: every cell is written below.
    matrix = np.empty((size, size))
    for row in range(size):
        left = items[row]
        for col in range(size):
            matrix[row, col] = metric(left, items[col])

    return cls(matrix, ids)
def from_iterable(cls, iterable, metric, key=None, keys=None,
                  validate=True):
    """Build a DistanceMatrix by applying `metric` to pairs of items.

    Parameters
    ----------
    iterable : iterable
        Objects between which pairwise distances are computed.
    metric : callable
        Two-argument function returning the distance (a float) between
        its arguments.
    key : callable or metadata key, optional
        Either a one-argument function producing the id of an element, or
        a key into each element's `metadata` property. When None, default
        ids are used.
    keys : iterable, optional
        Ids to use, one per element of `iterable`, in the same order.
    validate : boolean, optional
        When ``True``, every pairwise distance is computed (both
        triangles and the diagonal) and the resulting matrix is validated
        for symmetry and hollowness. When ``False``, `metric` is trusted
        to be hollow and symmetric and only the lower triangle, without
        the diagonal, is computed. Use ``validate=False`` for better
        performance when `metric` is known to be hollow and symmetric.

    Returns
    -------
    DistanceMatrix
        Result of applying `metric` to pairs of elements.

    Raises
    ------
    ValueError
        If `key` and `keys` are both provided.

    """
    if validate:
        # The parent implementation performs the exhaustive computation.
        parent = super(DistanceMatrix, cls)
        return parent.from_iterable(iterable, metric, key, keys)

    items = list(iterable)

    if not (key is None or keys is None):
        raise ValueError("Cannot use both `key` and `keys` at the same"
                         " time.")

    if key is None:
        ids = keys
    else:
        ids = [resolve_key(item, key) for item in items]

    size = len(items)
    matrix = np.zeros((size, size))
    for row in range(size):
        for col in range(row):
            # One metric call fills both mirrored cells.
            matrix[row, col] = matrix[col, row] = metric(items[row],
                                                         items[col])

    return cls(matrix, ids)
def sort(self, key=None, reverse=False):
    """Reorder the sequences of this MSA in-place.

    The sort is stable and modifies the MSA itself.

    Parameters
    ----------
    key : callable or metadata key, optional
        Defines the value each sequence is sorted on: either a callable
        taking a single argument (the sequence) or a key into each
        sequence's ``metadata`` attribute. When omitted, the keys already
        present on the ``TabularMSA`` are used.
    reverse: bool, optional
        If ``True``, sort in reverse order.

    Raises
    ------
    OperationError
        If `key` is not provided and keys do not exist on the MSA.

    See Also
    --------
    keys
    has_keys
    reindex

    Notes
    -----
    The API mirrors Python's built-in sorting functionality (e.g.,
    ``list.sort()``, ``sorted()``). See [1]_ for an excellent tutorial on
    sorting in Python.

    References
    ----------
    .. [1] https://docs.python.org/3/howto/sorting.html

    Examples
    --------
    Create a ``TabularMSA`` object without keys:

    >>> from skbio import DNA, TabularMSA
    >>> seqs = [DNA('ACG', metadata={'id': 'c'}),
    ...         DNA('AC-', metadata={'id': 'b'}),
    ...         DNA('AC-', metadata={'id': 'a'})]
    >>> msa = TabularMSA(seqs)

    Sort the sequences in alphabetical order by sequence identifier:

    >>> msa.sort(key='id')
    >>> msa == TabularMSA([DNA('AC-', metadata={'id': 'a'}),
    ...                    DNA('AC-', metadata={'id': 'b'}),
    ...                    DNA('ACG', metadata={'id': 'c'})])
    True

    Note that since the sort is in-place, the ``TabularMSA`` object is
    modified (a new object is **not** returned).

    Create a ``TabularMSA`` object with keys:

    >>> seqs = [DNA('ACG'), DNA('AC-'), DNA('AC-')]
    >>> msa = TabularMSA(seqs, keys=['c', 'b', 'a'])

    Sort the sequences using the MSA's existing keys:

    >>> msa.sort()
    >>> msa == TabularMSA([DNA('AC-'), DNA('AC-'), DNA('ACG')],
    ...                   keys=['a', 'b', 'c'])
    True

    """
    if key is not None:
        sort_values = [resolve_key(seq, key) for seq in self._seqs]
    else:
        sort_values = self.keys.tolist()

    # Nothing to reorder in an empty MSA.
    if len(self) == 0:
        return

    if self.has_keys():
        # Existing keys travel with their sequences so both stay aligned.
        columns = [sort_values, self._seqs, self.keys.tolist()]
        _, ordered_seqs, ordered_keys = self._sort_by_first_element(
            columns, reverse)
        self.keys = ordered_keys
    else:
        columns = [sort_values, self._seqs]
        _, ordered_seqs = self._sort_by_first_element(columns, reverse)

    self._seqs = list(ordered_seqs)
def reindex(self, key=None, keys=None):
    """Replace the keys associated with the sequences of this MSA.

    Parameters
    ----------
    key : callable or metadata key, optional
        Produces a unique, hashable key for each sequence: either a
        callable taking a single argument (the sequence) or a key into
        each sequence's ``metadata`` attribute.
    keys : iterable, optional
        Unique, hashable elements, one per sequence in the MSA, used as
        the respective keys.

    Raises
    ------
    ValueError
        If `key` and `keys` are both provided.
    ValueError
        If `keys` is not the same length as the number of sequences in the
        MSA.
    UniqueError
        If keys are not unique.

    See Also
    --------
    keys
    has_keys

    Notes
    -----
    If neither `key` nor `keys` is provided, keys will not be set and
    certain operations requiring keys will raise an ``OperationError``.

    Examples
    --------
    Create a ``TabularMSA`` object without keys:

    >>> from skbio import DNA, TabularMSA
    >>> seqs = [DNA('ACG', metadata={'id': 'a'}),
    ...         DNA('AC-', metadata={'id': 'b'})]
    >>> msa = TabularMSA(seqs)
    >>> msa.has_keys()
    False

    Set keys on the MSA, using each sequence's ID:

    >>> msa.reindex(key='id')
    >>> msa.has_keys()
    True
    >>> msa.keys
    array(['a', 'b'], dtype=object)

    Remove keys from the MSA:

    >>> msa.reindex()
    >>> msa.has_keys()
    False

    Alternatively, an iterable of keys may be passed via `keys`:

    >>> msa.reindex(keys=['a', 'b'])
    >>> msa.keys
    array(['a', 'b'], dtype=object)

    """
    if not (key is None or keys is None):
        raise ValueError(
            "Cannot use both `key` and `keys` at the same time.")

    if key is not None:
        new_keys = [resolve_key(seq, key) for seq in self._seqs]
    elif keys is not None:
        # Materialise first so one-shot iterables can be measured.
        new_keys = list(keys)
        if len(new_keys) != len(self):
            raise ValueError(
                "Number of elements in `keys` must match number of "
                "sequences: %d != %d" % (len(new_keys), len(self)))
    else:
        new_keys = None

    if new_keys is not None:
        # Looking for duplicates also implicitly checks hashability.
        duplicates = find_duplicates(new_keys)
        if duplicates:
            raise UniqueError(
                "Keys must be unique. Duplicate keys: %r" % duplicates)

        # Store a read-only ndarray so key invariants cannot be broken
        # afterwards. Object dtype keeps the original key types: for
        # example, np.array(['a', 42]) would otherwise upcast to
        # ['a', '42'].
        new_keys = np.array(new_keys, dtype=object, copy=True)
        new_keys.flags.writeable = False

    self._keys = new_keys
def test_wrong_type(self):
    """A plain dict paired with a non-callable key must raise TypeError."""
    self.assertRaises(TypeError, resolve_key, {'foo': 1}, 'foo')
def test_callable(self):
    """A callable key is expected to be applied to the object itself."""
    def stringify(value):
        return str(value)

    for value, expected in ((1, "1"), (4, "4")):
        self.assertEqual(resolve_key(value, stringify), expected)
def extend(self, sequences, minter=None, index=None):
    """Extend this MSA with sequences without recomputing alignment.

    Parameters
    ----------
    sequences : iterable of alphabet-aware scikit-bio sequence objects
        Sequences to be appended. Must match the dtype of the MSA and the
        number of positions in the MSA.
    minter : callable or metadata key, optional
        Used to create index labels for the sequences being appended. If
        callable, it generates a label directly. Otherwise it's treated as
        a key into the sequence metadata. Note that `minter` cannot be
        combined with `index`.
    index : pd.Index consumable, optional
        Index labels to use for the appended sequences. Must be the same
        length as `sequences`. Must be able to be passed directly to
        ``pd.Index`` constructor. Note that `index` cannot be combined
        with `minter`.

    Raises
    ------
    ValueError
        If both `minter` and `index` are both provided.
    ValueError
        If neither `minter` nor `index` are provided and the MSA has a
        non-default index.
    ValueError
        If `index` is not the same length as `sequences`.
    TypeError
        If `sequences` contains a type that does not have an alphabet.
    TypeError
        If `sequence` contains a type that does not match the dtype of the
        MSA.
    ValueError
        If the length of a sequence does not match the number of positions
        in the MSA.

    See Also
    --------
    append
    reassign_index

    Notes
    -----
    If neither `minter` nor `index` are provided and this MSA has default
    index labels, the new index labels will be auto-incremented.

    The MSA is not automatically re-aligned when appending sequences.
    Therefore, this operation is not necessarily meaningful on its own.

    Examples
    --------
    >>> from skbio import DNA, TabularMSA
    >>> msa = TabularMSA([DNA('ACGT')])
    >>> msa.extend([DNA('AG-T'), DNA('-G-T')])
    >>> msa == TabularMSA([DNA('ACGT'), DNA('AG-T'), DNA('-G-T')])
    True

    Auto-incrementing index labels:

    >>> msa.index
    Int64Index([0, 1, 2], dtype='int64')
    >>> msa.extend([DNA('ACGA'), DNA('AC-T'), DNA('----')])
    >>> msa.index
    Int64Index([0, 1, 2, 3, 4, 5], dtype='int64')

    """
    if minter is not None and index is not None:
        raise ValueError(
            "Cannot use both `minter` and `index` at the same time.")

    # Materialise once: `sequences` may be a one-shot iterable and is
    # traversed several times below.
    sequences = list(sequences)

    if minter is None and index is None:
        if self.index.equals(pd.Index(np.arange(len(self)))):
            # Default labels: continue counting from the current length.
            index = range(len(self), len(self) + len(sequences))
        else:
            raise ValueError(
                "MSA does not have default index labels, must provide "
                "a `minter` or `index` for sequence(s).")
    elif minter is not None:
        index = [resolve_key(seq, minter) for seq in sequences]

    # Cast to Index to identify tuples as a MultiIndex to match
    # pandas constructor. Just setting would make an index of tuples.
    if not isinstance(index, pd.Index):
        index = pd.Index(index)

    self._assert_valid_sequences(sequences)

    # pandas doesn't give a user-friendly error message if we pass through.
    if len(sequences) != len(index):
        raise ValueError(
            "Number of sequences (%d) must match index length (%d)" %
            (len(sequences), len(index)))

    # `Series.append` was deprecated in pandas 1.4 and removed in 2.0;
    # `pd.concat` is the equivalent supported by every pandas version.
    self._seqs = pd.concat(
        [self._seqs, pd.Series(sequences, index=index)])
def append(self, sequence, minter=None, label=None):
    """Append a sequence to the MSA without recomputing alignment.

    Parameters
    ----------
    sequence : alphabet-aware scikit-bio sequence object
        Sequence to be appended. Must match the dtype of the MSA and the
        number of positions in the MSA.
    minter : callable or metadata key, optional
        Used to create a label for the sequence being appended. If
        callable, it generates a label directly. Otherwise it's treated as
        a key into the sequence metadata. Note that `minter` cannot be
        combined with `label`.
    label : object, optional
        Index label to use for the appended sequence. Note that `label`
        cannot be combined with `minter`.

    Raises
    ------
    ValueError
        If both `minter` and `label` are provided.
    ValueError
        If neither `minter` nor `label` are provided and the MSA has a
        non-default index.
    TypeError
        If the sequence object is a type that doesn't have an alphabet.
    TypeError
        If the type of the sequence does not match the dtype of the MSA.
    ValueError
        If the length of the sequence does not match the number of
        positions in the MSA.

    See Also
    --------
    reassign_index

    Notes
    -----
    If neither `minter` nor `label` are provided and this MSA has default
    index labels, the new label will be auto-incremented.

    The MSA is not automatically re-aligned when a sequence is appended.
    Therefore, this operation is not necessarily meaningful on its own.

    Examples
    --------
    >>> from skbio import DNA, TabularMSA
    >>> msa = TabularMSA([DNA('ACGT')])
    >>> msa.append(DNA('AG-T'))
    >>> msa == TabularMSA([DNA('ACGT'), DNA('AG-T')])
    True

    Auto-incrementing index labels:

    >>> msa.index
    Int64Index([0, 1], dtype='int64')
    >>> msa.append(DNA('ACGA'))
    >>> msa.index
    Int64Index([0, 1, 2], dtype='int64')

    """
    if minter is not None and label is not None:
        raise ValueError(
            "Cannot use both `minter` and `label` at the same time.")

    if minter is None and label is None:
        if self.index.equals(pd.Index(np.arange(len(self)))):
            # Default labels: the next label is the current length.
            label = len(self)
        else:
            raise ValueError(
                "Must provide a `minter` or `label` for this sequence.")

    if minter is not None:
        label = resolve_key(sequence, minter)

    self._assert_valid_sequence(sequence)

    # `Series.append` was deprecated in pandas 1.4 and removed in 2.0;
    # `pd.concat` is the equivalent supported by every pandas version.
    self._seqs = pd.concat(
        [self._seqs, pd.Series([sequence], index=[label])])
def reassign_index(self, mapping=None, minter=None):
    """Replace the index labels of the sequences in this MSA.

    Parameters
    ----------
    mapping : dict-like or callable, optional
        Maps existing labels to new labels. Labels without a mapping are
        left unchanged.
    minter : callable or metadata key, optional
        Defines an index label for each sequence: either a callable
        taking a single argument (the sequence) or a key into each
        sequence's ``metadata`` attribute.

    Raises
    ------
    ValueError
        If `mapping` and `minter` are both provided.

    See Also
    --------
    index

    Notes
    -----
    When neither `mapping` nor `minter` is given, default pandas labels
    are used: integer labels ``0..(N-1)``, where ``N`` is the number of
    sequences.

    Examples
    --------
    Create a ``TabularMSA`` object with default index labels:

    >>> from skbio import DNA, TabularMSA
    >>> seqs = [DNA('ACG', metadata={'id': 'a'}),
    ...         DNA('AC-', metadata={'id': 'b'})]
    >>> msa = TabularMSA(seqs)
    >>> msa.index
    Int64Index([0, 1], dtype='int64')

    Assign new index to the MSA using each sequence's ID as a label:

    >>> msa.reassign_index(minter='id')
    >>> msa.index
    Index(['a', 'b'], dtype='object')

    Assign default index:

    >>> msa.reassign_index()
    >>> msa.index
    Int64Index([0, 1], dtype='int64')

    Alternatively, a mapping of existing labels to new labels may be passed
    via `mapping`:

    >>> msa.reassign_index(mapping={0: 'seq1', 1: 'seq2'})
    >>> msa.index
    Index(['seq1', 'seq2'], dtype='object')

    """
    if not (mapping is None or minter is None):
        raise ValueError(
            "Cannot use both `mapping` and `minter` at the same time.")

    if mapping is not None:
        self._seqs.rename(mapping, inplace=True)
        return

    if minter is None:
        # No instructions given: fall back to default integer labels.
        self._seqs.reset_index(drop=True, inplace=True)
        return

    labels = [resolve_key(seq, minter) for seq in self._seqs]
    # Go through pd.Index so that tuples are recognised as a MultiIndex,
    # matching the pandas constructor; assigning the raw list would give
    # an index of tuples.
    self.index = pd.Index(labels)
def reindex(self, key=None, keys=None):
    """Assign a fresh set of keys to the sequences in the MSA.

    Parameters
    ----------
    key : callable or metadata key, optional
        Yields a unique, hashable key per sequence. May be a callable
        that accepts a single argument (each sequence) or a key into each
        sequence's ``metadata`` attribute.
    keys : iterable, optional
        One unique, hashable element per sequence in the MSA; each
        element becomes the key of the sequence at the same position.

    Raises
    ------
    ValueError
        If `key` and `keys` are both provided.
    ValueError
        If `keys` is not the same length as the number of sequences in the
        MSA.
    UniqueError
        If keys are not unique.

    See Also
    --------
    keys
    has_keys

    Notes
    -----
    Calling this with neither `key` nor `keys` leaves the MSA without
    keys, and certain operations requiring keys will then raise an
    ``OperationError``.

    Examples
    --------
    Create a ``TabularMSA`` object without keys:

    >>> from skbio import DNA, TabularMSA
    >>> seqs = [DNA('ACG', metadata={'id': 'a'}),
    ...         DNA('AC-', metadata={'id': 'b'})]
    >>> msa = TabularMSA(seqs)
    >>> msa.has_keys()
    False

    Set keys on the MSA, using each sequence's ID:

    >>> msa.reindex(key='id')
    >>> msa.has_keys()
    True
    >>> msa.keys
    array(['a', 'b'], dtype=object)

    Remove keys from the MSA:

    >>> msa.reindex()
    >>> msa.has_keys()
    False

    Alternatively, an iterable of keys may be passed via `keys`:

    >>> msa.reindex(keys=['a', 'b'])
    >>> msa.keys
    array(['a', 'b'], dtype=object)

    """
    key_given = key is not None
    keys_given = keys is not None
    if key_given and keys_given:
        raise ValueError(
            "Cannot use both `key` and `keys` at the same time.")

    candidate = None
    if key_given:
        candidate = [resolve_key(seq, key) for seq in self._seqs]
    elif keys_given:
        # Copy into a list so any iterable can be length-checked.
        candidate = list(keys)
        if len(candidate) != len(self):
            raise ValueError(
                "Number of elements in `keys` must match number of "
                "sequences: %d != %d" % (len(candidate), len(self)))

    if candidate is None:
        # No keys requested: clear whatever was set before.
        self._keys = None
        return

    # The duplicate search doubles as an implicit hashability check.
    duplicates = find_duplicates(candidate)
    if duplicates:
        raise UniqueError(
            "Keys must be unique. Duplicate keys: %r" % duplicates)

    # Keep the keys in an immutable ndarray so their invariants hold.
    # Object dtype preserves the original key types; without it,
    # np.array(['a', 42]) would upcast to ['a', '42'].
    frozen = np.array(candidate, dtype=object, copy=True)
    frozen.flags.writeable = False
    self._keys = frozen