def make_index_unique(index: pd.Index, join: str = '-') -> pd.Index:
    """Makes the index unique by appending '1', '2', etc.

    The first occurrence of a non-unique value is ignored. If a tentative
    new name (e.g. ``'b-1'``) already exists in the index, the next integer
    in the sequence is tried, so the returned index is always unique.

    Parameters
    ----------
    index
        The index to deduplicate. Returned unchanged if already unique.
    join
        The connecting string between name and integer.

    Returns
    -------
    A unique :class:`pd.Index`; the original index object is not modified.

    Examples
    --------
    >>> adata1 = sc.AnnData(np.ones((3, 2)), {'obs_names': ['a', 'b', 'c']})
    >>> adata2 = sc.AnnData(np.zeros((3, 2)), {'obs_names': ['d', 'b', 'b']})
    >>> adata = adata1.concatenate(adata2)
    >>> adata.obs_names
    Index(['a', 'b', 'c', 'd', 'b', 'b'], dtype='object')
    >>> adata.obs_names_make_unique()
    >>> adata.obs_names
    Index(['a', 'b', 'c', 'd', 'b-1', 'b-2'], dtype='object')
    """
    if index.is_unique:
        return index
    from collections import Counter

    # Copy: ``index.values`` can be the index's underlying array, and
    # writing into it would silently mutate the caller's index.
    values = index.values.copy()
    indices_dup = index.duplicated(keep='first')
    values_dup = values[indices_dup]
    # Track every name already taken so generated suffixed names
    # cannot collide with pre-existing values (e.g. an existing 'b-1').
    values_set = set(values)
    counter = Counter()
    for i, v in enumerate(values_dup):
        while True:
            counter[v] += 1
            tentative_new_name = v + join + str(counter[v])
            if tentative_new_name not in values_set:
                values_set.add(tentative_new_name)
                values_dup[i] = tentative_new_name
                break
    values[indices_dup] = values_dup
    # Preserve the original index name on the rebuilt index.
    return pd.Index(values, name=index.name)
def make_index_unique(index: pd.Index, join: str = "-") -> pd.Index:
    """
    Makes the index unique by appending a number string to each duplicate
    index element: '1', '2', etc.

    If a tentative name created by the algorithm already exists in the
    index, it tries the next integer in the sequence.

    The first occurrence of a non-unique value is ignored.

    Parameters
    ----------
    join
        The connecting string between name and integer.

    Returns
    -------
    A unique :class:`pd.Index`; the input index is not modified. A
    ``UserWarning`` is emitted when suffix collisions forced the algorithm
    to skip tentative names, since the result can be hard to interpret.

    Examples
    --------
    >>> from anndata import AnnData
    >>> adata = AnnData(np.ones((2, 3)), var=pd.DataFrame(index=["a", "a", "b"]))
    >>> adata.var_names
    Index(['a', 'a', 'b'], dtype='object')
    >>> adata.var_names_make_unique()
    >>> adata.var_names
    Index(['a', 'a-1', 'b'], dtype='object')
    """
    if index.is_unique:
        return index
    from collections import Counter

    values = index.values.copy()
    indices_dup = index.duplicated(keep="first")
    values_dup = values[indices_dup]
    # Names already taken — generated names must not collide with these.
    values_set = set(values)
    counter = Counter()
    issue_interpretation_warning = False
    example_colliding_values = []
    for i, v in enumerate(values_dup):
        while True:
            counter[v] += 1
            tentative_new_name = v + join + str(counter[v])
            if tentative_new_name not in values_set:
                values_set.add(tentative_new_name)
                values_dup[i] = tentative_new_name
                break
            # Tentative name collided with an existing value; remember a
            # few examples so the warning is actionable.
            issue_interpretation_warning = True
            if len(example_colliding_values) < 5:
                example_colliding_values.append(tentative_new_name)

    if issue_interpretation_warning:
        # Fixed grammar and missing sentence separator in the original
        # message ("There values with a similar suffixes in ...").
        warnings.warn(
            f"Suffix used ({join}[0-9]+) to deduplicate index values may make index "
            "values difficult to interpret. There are values with similar suffixes "
            "in the index. Consider using a different delimiter by passing "
            "`join={delimiter}`. "
            "Example key collisions generated by the make_index_unique algorithm: "
            f"{example_colliding_values}"
        )
    values[indices_dup] = values_dup
    index = pd.Index(values, name=index.name)
    return index
def assert_index(obj: pd.Index):
    r"""Check if index is conform to audformat."""
    # Anything that is not a two-level MultiIndex is delegated to the
    # generic audformat check.
    if not (isinstance(obj, pd.MultiIndex) and len(obj.levels) == 2):
        audformat.assert_index(obj)
        return

    # 1. No duplicated (start, end) pairs allowed.
    if obj.has_duplicates:
        max_display = 10
        duplicates = obj[obj.duplicated()]
        msg_tail = '\n...' if len(duplicates) > max_display else ''
        msg_duplicates = '\n'.join(
            str(duplicate)
            for duplicate in duplicates[:max_display].tolist()
        )
        raise ValueError('Found duplicates:\n'
                         f'{msg_duplicates}{msg_tail}')

    # 2. Levels must be named ('start', 'end') as defined by audformat.
    names_conform = (
        obj.names[0] == audformat.define.IndexField.START
        and obj.names[1] == audformat.define.IndexField.END
    )
    if not names_conform:
        expected_names = [
            audformat.define.IndexField.START,
            audformat.define.IndexField.END,
        ]
        raise ValueError('Found two levels with names '
                         f'{obj.names}, '
                         f'but expected names '
                         f'{expected_names}.')

    # 3. Both levels must hold timedelta64[ns] values.
    for position, level_name in enumerate(('start', 'end')):
        if not pd.api.types.is_timedelta64_dtype(obj.levels[position].dtype):
            raise ValueError(
                f"Level '{level_name}' must contain values of type "
                "'timedelta64[ns]'.")
def _maybe_check_integrity(self, concat_index: Index): if self.verify_integrity: if not concat_index.is_unique: overlap = concat_index[concat_index.duplicated()].unique() raise ValueError( "Indexes have overlapping values: " "{overlap!s}".format(overlap=overlap) )
def make_index_unique(index: pd.Index, join: str = "-") -> pd.Index:
    """
    Makes the index unique by appending a number string to each duplicate
    index element: '1', '2', etc.

    If a tentative name created by the algorithm already exists in the
    index, it tries the next integer in the sequence.

    The first occurrence of a non-unique value is ignored.

    Parameters
    ----------
    join
        The connecting string between name and integer.

    Returns
    -------
    A unique :class:`pd.Index`; the input index object is left untouched.

    Examples
    --------
    >>> from anndata import AnnData
    >>> adata1 = AnnData(np.ones((3, 2)), dict(obs_names=['a', 'b', 'c']))
    >>> adata2 = AnnData(np.zeros((3, 2)), dict(obs_names=['d', 'b', 'b']))
    >>> adata = adata1.concatenate(adata2)
    >>> adata.obs_names
    Index(['a', 'b', 'c', 'd', 'b', 'b'], dtype='object')
    >>> adata.obs_names_make_unique()
    >>> adata.obs_names
    Index(['a', 'b', 'c', 'd', 'b-1', 'b-2'], dtype='object')
    """
    if index.is_unique:
        return index
    from collections import defaultdict

    # Copy before writing: ``index.values`` can be the index's underlying
    # array, and assigning into it would mutate the caller's index.
    values = index.values.copy()
    values_set = set(values)
    indices_dup = index.duplicated(keep="first")
    values_dup = values[indices_dup]
    counter = defaultdict(lambda: 0)
    for i, v in enumerate(values_dup):
        while True:
            counter[v] += 1
            tentative_new_name = v + join + str(counter[v])
            if tentative_new_name not in values_set:
                values_set.add(tentative_new_name)
                values_dup[i] = tentative_new_name
                break
    values[indices_dup] = values_dup
    # Keep the original index name on the rebuilt index.
    index = pd.Index(values, name=index.name)
    return index
def check_no_dupes(idx: pd.Index, name: str) -> bool:
    """Return ``True`` when *idx* contains no duplicates.

    When duplicates exist, emit a warning listing every duplicated entry
    (sorted) and return ``False``.
    """
    has_duplicates = bool(idx.duplicated().any())
    if has_duplicates:
        # keep=False marks *all* occurrences, so the warning shows each
        # duplicated value, not just the repeats.
        warn(f"Duplicated {name}: {idx[idx.duplicated(False)].sort_values()}")
    return not has_duplicates
def index_has_duplicates(index: Index) -> bool:
    """Indicates whether a DataFrame's Index contains any duplicates."""
    # bool() so the function honors its declared return type:
    # ``.duplicated().any()`` yields numpy.bool_, not a builtin bool.
    return bool(index.duplicated().any())
def test_index_has_no_duplicates(self, index: Index): self.assertEqual(first=index.duplicated().any(), second=False, msg='The index contains duplicate values.')