Exemplo n.º 1
0
def make_index_unique(index: pd.Index, join: str = '-'):
    """Makes the index unique by appending '1', '2', etc.

    The first occurrence of a non-unique value is ignored.

    Parameters
    ----------
    join
         The connecting string between name and integer.

    Examples
    --------
    >>> adata1 = sc.AnnData(np.ones((3, 2)), {'obs_names': ['a', 'b', 'c']})
    >>> adata2 = sc.AnnData(np.zeros((3, 2)), {'obs_names': ['d', 'b', 'b']})
    >>> adata = adata1.concatenate(adata2)
    >>> adata.obs_names
    Index(['a', 'b', 'c', 'd', 'b', 'b'], dtype='object')
    >>> adata.obs_names_make_unique()
    >>> adata.obs_names
    Index(['a', 'b', 'c', 'd', 'b-1', 'b-2'], dtype='object')
    """
    if index.is_unique:
        return index
    from collections import defaultdict

    # Copy: ``index.values`` may be a view of the index's own buffer, and
    # writing through it would silently mutate the caller's index.
    values = index.values.copy()
    indices_dup = index.duplicated(keep='first')
    values_dup = values[indices_dup]
    # Track every value already present so a generated name such as 'b-1'
    # cannot collide with an existing 'b-1' and leave the index non-unique.
    values_set = set(values)
    counter = defaultdict(int)
    for i, v in enumerate(values_dup):
        while True:
            counter[v] += 1
            candidate = v + join + str(counter[v])
            if candidate not in values_set:
                values_set.add(candidate)
                values_dup[i] = candidate
                break
    values[indices_dup] = values_dup
    return pd.Index(values)
Exemplo n.º 2
0
def make_index_unique(index: pd.Index, join: str = "-"):
    """
    Makes the index unique by appending a number string to each duplicate index element:
    '1', '2', etc.

    If a tentative name created by the algorithm already exists in the index, it tries
    the next integer in the sequence.

    The first occurrence of a non-unique value is ignored.

    Parameters
    ----------
    join
         The connecting string between name and integer.

    Examples
    --------
    >>> from anndata import AnnData
    >>> adata = AnnData(np.ones((2, 3)), var=pd.DataFrame(index=["a", "a", "b"]))
    >>> adata.var_names
    Index(['a', 'a', 'b'], dtype='object')
    >>> adata.var_names_make_unique()
    >>> adata.var_names
    Index(['a', 'a-1', 'b'], dtype='object')
    """
    if index.is_unique:
        return index
    from collections import Counter

    # Copy so writes below never reach the caller's index buffer.
    values = index.values.copy()
    indices_dup = index.duplicated(keep="first")
    values_dup = values[indices_dup]
    values_set = set(values)
    counter = Counter()
    issue_interpretation_warning = False
    example_colliding_values = []
    for i, v in enumerate(values_dup):
        while True:
            counter[v] += 1
            tentative_new_name = v + join + str(counter[v])
            if tentative_new_name not in values_set:
                values_set.add(tentative_new_name)
                values_dup[i] = tentative_new_name
                break
            # A generated name collided with a value already in the index;
            # remember a few examples for the warning below.
            issue_interpretation_warning = True
            if len(example_colliding_values) < 5:
                example_colliding_values.append(tentative_new_name)

    if issue_interpretation_warning:
        # Fixed message: the original had a garbled sentence ("There values
        # with a similar suffixes in") and ran two sentences together.
        warnings.warn(
            f"Suffix used ({join}[0-9]+) to deduplicate index values may make "
            "index values difficult to interpret. There are values with "
            "similar suffixes in the index. Consider using a different "
            "delimiter by passing `join={delimiter}`. "
            "Example key collisions generated by the make_index_unique "
            f"algorithm: {example_colliding_values}"
        )
    values[indices_dup] = values_dup
    index = pd.Index(values, name=index.name)
    return index
Exemplo n.º 3
0
def assert_index(obj: pd.Index):
    r"""Check if index is conform to audformat."""

    if not (isinstance(obj, pd.MultiIndex) and len(obj.levels) == 2):
        # Anything that is not a two-level MultiIndex is delegated to the
        # generic audformat check.
        audformat.assert_index(obj)
        return

    if obj.has_duplicates:
        max_display = 10
        duplicates = obj[obj.duplicated()]
        msg_tail = '\n...' if len(duplicates) > max_display else ''
        msg_duplicates = '\n'.join(
            str(duplicate)
            for duplicate in duplicates[:max_display].tolist()
        )
        raise ValueError('Found duplicates:\n'
                         f'{msg_duplicates}{msg_tail}')

    expected_names = [
        audformat.define.IndexField.START,
        audformat.define.IndexField.END,
    ]
    if list(obj.names) != expected_names:
        raise ValueError('Found two levels with names '
                         f'{obj.names}, '
                         f'but expected names '
                         f'{expected_names}.')

    # Both levels must hold timedelta64[ns] values.
    if not pd.api.types.is_timedelta64_dtype(obj.levels[0].dtype):
        raise ValueError(
            "Level 'start' must contain values of type 'timedelta64[ns]'.")
    if not pd.api.types.is_timedelta64_dtype(obj.levels[1].dtype):
        raise ValueError(
            "Level 'end' must contain values of type 'timedelta64[ns]'.")
Exemplo n.º 4
0
 def _maybe_check_integrity(self, concat_index: Index):
     # Optional sanity check: when ``verify_integrity`` was requested,
     # reject a concatenated index that contains duplicate labels.
     if not self.verify_integrity:
         return
     if concat_index.is_unique:
         return
     overlap = concat_index[concat_index.duplicated()].unique()
     raise ValueError(f"Indexes have overlapping values: {overlap!s}")
Exemplo n.º 5
0
def make_index_unique(index: pd.Index, join: str = "-"):
    """
    Makes the index unique by appending a number string to each duplicate index element: '1', '2', etc.

    If a tentative name created by the algorithm already exists in the index, it tries the next integer in the sequence.

    The first occurrence of a non-unique value is ignored.
    Parameters
    ----------
    join
         The connecting string between name and integer.
    Examples
    --------
    >>> from anndata import AnnData
    >>> adata1 = AnnData(np.ones((3, 2)), dict(obs_names=['a', 'b', 'c']))
    >>> adata2 = AnnData(np.zeros((3, 2)), dict(obs_names=['d', 'b', 'b']))
    >>> adata = adata1.concatenate(adata2)
    >>> adata.obs_names
    Index(['a', 'b', 'c', 'd', 'b', 'b'], dtype='object')
    >>> adata.obs_names_make_unique()
    >>> adata.obs_names
    Index(['a', 'b', 'c', 'd', 'b-1', 'b-2'], dtype='object')
    """
    if index.is_unique:
        return index
    from collections import defaultdict

    # Work on a copy: ``index.values`` may be a view of the index's own
    # buffer, and writing through it would mutate the caller's index.
    values = index.values.copy()
    values_set = set(values)
    indices_dup = index.duplicated(keep="first")
    values_dup = values[indices_dup]
    counter = defaultdict(int)
    for i, v in enumerate(values_dup):
        # Keep trying v-1, v-2, ... until the name is not already taken.
        while True:
            counter[v] += 1
            tentative_new_name = v + join + str(counter[v])
            if tentative_new_name not in values_set:
                values_set.add(tentative_new_name)
                values_dup[i] = tentative_new_name
                break

    values[indices_dup] = values_dup
    # Preserve the original index name on the rebuilt index.
    return pd.Index(values, name=index.name)
Exemplo n.º 6
0
def check_no_dupes(idx: pd.Index, name: str) -> bool:
    """Return ``True`` when ``idx`` contains no duplicate labels.

    When duplicates exist, a warning listing every duplicated entry
    (sorted) is emitted and ``False`` is returned.
    """
    has_dupes = idx.duplicated().any()
    if has_dupes:
        warn(f"Duplicated {name}: {idx[idx.duplicated(False)].sort_values()}")
    return not has_dupes
Exemplo n.º 7
0
def index_has_duplicates(index: Index) -> bool:
    """Indicates whether a DataFrame's Index contains any duplicates."""

    duplicate_mask = index.duplicated()
    return duplicate_mask.any()
Exemplo n.º 8
0
 def test_index_has_no_duplicates(self, index: Index):
     """Assert that ``index`` holds no duplicated values."""
     self.assertEqual(
         first=index.duplicated().any(),
         second=False,
         msg='The index contains duplicate values.',
     )