Пример #1
0
    def __init__(self, uri=None, modality=None):
        super(Annotation, self).__init__()

        self._uri = uri
        self.modality = modality

        # sorted dictionary
        # keys: annotated segments
        # values: {track: label} dictionary
        self._tracks = SortedDict(key_type=(float, float),
                                  updator=TimelineUpdator)

        # dictionary
        # key: label
        # value: timeline
        self._labels = {}
        self._labelNeedsUpdate = {}

        # timeline meant to store all annotated segments
        self._timeline = Timeline(uri=uri)
        self._timelineNeedsUpdate = True
Пример #2
0
    def __init__(self, uri=None, modality=None):
        super(Annotation, self).__init__()

        self._uri = uri
        self.modality = modality

        # sorted dictionary
        # keys: annotated segments
        # values: {track: label} dictionary
        self._tracks = SortedDict(key_type=(float, float),
                                  updator=TimelineUpdator)

        # dictionary
        # key: label
        # value: timeline
        self._labels = {}
        self._labelNeedsUpdate = {}

        # timeline meant to store all annotated segments
        self._timeline = Timeline(uri=uri)
        self._timelineNeedsUpdate = True
Пример #3
0
    def copy(self):

        # create new empty annotation
        copied = self.__class__(uri=self.uri, modality=self.modality)

        # deep copy internal track dictionary
        _tracks = [(key, dict(value)) for (key, value) in self._tracks.items()]
        copied._tracks = SortedDict(items=_tracks,
                                    key_type=(float, float),
                                    updator=TimelineUpdator)

        # deep copy internal label timelines
        _labels = {key: timeline.copy()
                   for (key, timeline) in self._labels.iteritems()}
        copied._labels = _labels

        # deep copy need-update indicator
        copied._labelNeedsUpdate = dict(self._labelNeedsUpdate)

        copied._timelineNeedsUpdate = self._timelineNeedsUpdate

        return copied
Пример #4
0
class Annotation(object):
    """Annotation

    Parameters
    ----------
    uri : string, optional
        uniform resource identifier of annotated document
    modality : string, optional
        name of annotated modality

    """

    @classmethod
    def from_df(cls, df, uri=None, modality=None, aggfunc=np.mean):
        """

        Parameters
        ----------
        df : DataFrame
            Must contain the following columns: 'segment', 'track' and 'label'
        uri : str, optional
            Resource identifier
        modality : str, optional
            Modality
        aggfunc : func
            Value aggregation function in case of duplicate (segment, track,
            label) tuples

        Returns
        -------

        """
        annotation = cls(uri=uri, modality=modality)
        for _, (segment, track, label) in df[[SEGMENT, TRACK, LABEL]].iterrows():
            annotation[segment, track] = label
        return annotation

    def __init__(self, uri=None, modality=None):
        super(Annotation, self).__init__()

        self._uri = uri
        self.modality = modality

        # sorted dictionary
        # keys: annotated segments
        # values: {track: label} dictionary
        self._tracks = SortedDict(key_type=(float, float),
                                  updator=TimelineUpdator)

        # dictionary
        # key: label
        # value: timeline
        self._labels = {}
        self._labelNeedsUpdate = {}

        # timeline meant to store all annotated segments
        self._timeline = Timeline(uri=uri)
        self._timelineNeedsUpdate = True

    def _get_uri(self):
        return self._uri

    def _set_uri(self, uri):
        # update uri for all internal timelines
        for _, timeline in self._labels.iteritems():
            timeline.uri = uri
        self._uri = uri

    uri = property(_get_uri, fset=_set_uri, doc="Resource identifier")

    def _updateLabels(self):

        # (re-)initialize changed label timeline
        for l, needsUpdate in self._labelNeedsUpdate.iteritems():
            if needsUpdate:
                self._labels[l] = Timeline(uri=self.uri)

        # fill changed label timeline
        for segment, track, l in self.itertracks(label=True):
            if self._labelNeedsUpdate[l]:
                self._labels[l].add(segment)

        self._labelNeedsUpdate = {l: False for l in self._labels}

        # remove "ghost" labels (i.e. label with empty timeline)
        labels = self._labels.keys()
        for l in labels:
            if not self._labels[l]:
                self._labels.pop(l)
                self._labelNeedsUpdate.pop(l)

    def __len__(self):
        """Number of segments"""
        return self._tracks.length()

    def __nonzero__(self):
        return self._tracks.length() > 0

    def itersegments(self):
        """Segment iterator"""
        return iter(self._tracks)

    def itertracks(self, label=False):
        for segment, tracks in self._tracks.items():
            for track, lbl in tracks.iteritems():
                if label:
                    yield segment, track, lbl
                else:
                    yield segment, track

    def _updateTimeline(self):
        self._timeline = Timeline(segments=self._tracks, uri=self.uri)
        self._timelineNeedsUpdate = False

    def get_timeline(self):
        """Get timeline made of annotated segments"""
        if self._timelineNeedsUpdate:
            self._updateTimeline()
        return self._timeline

    def __eq__(self, other):
        return self._tracks == other._tracks

    def __ne__(self, other):
        return self._tracks != other._tracks

    def __contains__(self, included):
        """Inclusion

        Use expression 'segment in annotation' or 'timeline in annotation'

        Parameters
        ----------
        included : `Segment` or `Timeline`

        Returns
        -------
        contains : bool
            True if every segment in `included` exists in annotation
            False otherwise

        """
        return included in self.get_timeline()

    def crop(self, other, mode='intersection'):
        """Crop annotation

        Parameters
        ----------
        other : `Segment` or `Timeline`

        mode : {'strict', 'loose', 'intersection'}
            In 'strict' mode, only segments fully included in focus coverage
            are kept. In 'loose' mode, any intersecting segment is kept
            unchanged. In 'intersection' mode, only intersecting segments are
            kept and replaced by their actual intersection with the focus.

        Returns
        -------
        cropped : Annotation

        Remarks
        -------
        In 'intersection' mode, the best is done to keep the track names
        unchanged. However, in some cases where two original segments are
        cropped into the same resulting segments, conflicting track names are
        modified to make sure no track is lost.
        """

        if isinstance(other, Segment):
            other = Timeline(segments=[other], uri=self.uri)
            cropped = self.crop(other, mode=mode)

        elif isinstance(other, Timeline):

            cropped = self.__class__(uri=self.uri, modality=self.modality)

            if mode == 'loose':
                # TODO
                # update co_iter to yield (segment, tracks), (segment, tracks)
                # instead of segment, segment
                # This would avoid calling ._tracks.get(segment)
                for segment, _ in self.get_timeline().co_iter(other):
                    for track, label in self._tracks[segment].iteritems():
                        cropped[segment, track] = label

            elif mode == 'strict':
                # TODO
                # see above
                for segment, other_segment in self.get_timeline().co_iter(other):
                    if segment in other_segment:
                        for track, label in self._tracks[segment].iteritems():
                            cropped[segment, track] = label

            elif mode == 'intersection':
                # see above
                for segment, other_segment in self.get_timeline().co_iter(other):
                    intersection = segment & other_segment
                    for track, label in self._tracks[segment].iteritems():
                        track = cropped.new_track(intersection,
                                                  candidate=track)
                        cropped[intersection, track] = label

            else:
                raise NotImplementedError("unsupported mode: '%s'" % mode)

        return cropped

    def get_tracks(self, segment):
        """Set of tracks for query segment

        Parameters
        ----------
        segment : `Segment`
            Query segment

        Returns
        -------
        tracks : set
            Set of tracks for query segment
        """
        return set(self._tracks.get(segment, {}))

    def has_track(self, segment, track):
        """Check whether a given track exists

        Parameters
        ----------
        segment : `Segment`
            Query segment
        track :
            Query track

        Returns
        -------
        exists : bool
            True if track exists for segment
        """
        return track in self._tracks.get(segment, {})

    def get_track_by_name(self, track):
        """Get all tracks with given name

        Parameters
        ----------
        track : any valid track name
            Requested name track

        Returns
        -------
        tracks : list
            List of (segment, track) tuples
        """
        raise NotImplementedError('')

    def copy(self):

        # create new empty annotation
        copied = self.__class__(uri=self.uri, modality=self.modality)

        # deep copy internal track dictionary
        _tracks = [(key, dict(value)) for (key, value) in self._tracks.items()]
        copied._tracks = SortedDict(items=_tracks,
                                    key_type=(float, float),
                                    updator=TimelineUpdator)

        # deep copy internal label timelines
        _labels = {key: timeline.copy()
                   for (key, timeline) in self._labels.iteritems()}
        copied._labels = _labels

        # deep copy need-update indicator
        copied._labelNeedsUpdate = dict(self._labelNeedsUpdate)

        copied._timelineNeedsUpdate = self._timelineNeedsUpdate

        return copied

    def retrack(self):
        """
        """
        retracked = self.__class__(uri=self.uri, modality=self.modality)
        for n, (s, _, label) in enumerate(self.itertracks(label=True)):
            retracked[s, n] = label
        return retracked

    def new_track(self, segment, candidate=None, prefix=None):
        """Track name generator

        Parameters
        ----------
        segment : Segment
        prefix : str, optional
        candidate : any valid track name

        Returns
        -------
        track : str
            New track name
        """

        # obtain list of existing tracks for segment
        existing_tracks = set(self._tracks.get(segment, {}))

        # if candidate is provided, check whether it already exists
        # in case it does not, use it
        if (candidate is not None) and (candidate not in existing_tracks):
            return candidate

        # no candidate was provided or the provided candidate already exists
        # we need to create a brand new one

        # by default (if prefix is not provided)
        # use modality as prefix (eg. speaker1, speaker2, ...)
        if prefix is None:
            prefix = '' if self.modality is None else str(self.modality)

        # find first non-existing track name for segment
        # eg. if speaker1 exists, try speaker2, then speaker3, ...
        count = 1
        while ('%s%d' % (prefix, count)) in existing_tracks:
            count += 1

        # return first non-existing track name
        return '%s%d' % (prefix, count)

    def __str__(self):
        """Human-friendly representation"""
        # TODO: use pandas.DataFrame
        return "\n".join(["%s %s %s" % (s, t, l)
                          for s, t, l in self.itertracks(label=True)])

    def __delitem__(self, key):

        # del annotation[segment]
        if isinstance(key, Segment):

            # Pop segment out of dictionary
            # and get corresponding tracks
            # Raises KeyError if segment does not exist
            tracks = self._tracks.pop(key)

            # mark timeline as modified
            self._timelineNeedsUpdate = True

            # mark every label in tracks as modified
            for track, label in tracks.iteritems():
                self._labelNeedsUpdate[label] = True

        # del annotation[segment, track]
        elif isinstance(key, tuple) and len(key) == 2:

            # get segment tracks as dictionary
            # if segment does not exist, get empty dictionary
            # Raises KeyError if segment does not exist
            tracks = self._tracks[key[0]]

            # pop track out of tracks dictionary
            # and get corresponding label
            # Raises KeyError if track does not exist
            label = tracks.pop(key[1])

            # mark label as modified
            self._labelNeedsUpdate[label] = True

            # if tracks dictionary is now empty,
            # remove segment as well
            if not tracks:
                self._tracks.pop(key[0])
                self._timelineNeedsUpdate = True

        else:
            raise KeyError('')

    # label = annotation[segment, track]
    def __getitem__(self, key):

        if isinstance(key, Segment):
            key = (key, '_')

        return self._tracks[key[0]][key[1]]

    # annotation[segment, track] = label
    def __setitem__(self, key, label):

        if isinstance(key, Segment):
            key = (key, '_')

        if key[0] not in self._tracks:
            self._tracks[key[0]] = {}
            self._timelineNeedsUpdate = True

        self._tracks[key[0]][key[1]] = label
        self._labelNeedsUpdate[label] = True

    def empty(self):
        return self.__class__(uri=self.uri, modality=self.modality)

    def labels(self, unknown=True):
        """List of labels

        Parameters
        ----------
        unknown : bool, optional
            When False, do not return Unknown instances
            When True, return any label (even Unknown instances)

        Returns
        -------
        labels : list
            Sorted list of labels

        Remarks
        -------
            Labels are sorted based on their string representation.
        """

        if any([lnu for lnu in self._labelNeedsUpdate.values()]):
            self._updateLabels()

        labels = sorted(self._labels, key=str)

        if not unknown:
            labels = [l for l in labels if not isinstance(l, Unknown)]

        return labels

    def get_labels(self, segment, unknown=True, unique=True):
        """Local set of labels

        Parameters
        ----------
        segment : Segment
            Segments to get label from.
        unknown : bool, optional
            When False, do not return Unknown instances
            When True, return any label (even Unknown instances)
        unique : bool, optional
            When False, return the list of (possibly repeated) labels.
            When True (default), return the set of labels
        Returns
        -------
        labels : set
            Set of labels for `segment` if it exists, empty set otherwise.

        Examples
        --------

            >>> annotation = Annotation()
            >>> segment = Segment(0, 2)
            >>> annotation[segment, 'speaker1'] = 'Bernard'
            >>> annotation[segment, 'speaker2'] = 'John'
            >>> print sorted(annotation.get_labels(segment))
            set(['Bernard', 'John'])
            >>> print annotation.get_labels(Segment(1, 2))
            set([])

        """

        labels = self._tracks.get(segment, {}).values()

        if not unknown:
            labels = [l for l in labels if not isinstance(l, Unknown)]

        if unique:
            labels = set(labels)

        return labels

    def subset(self, labels, invert=False):
        """Annotation subset

        Extract annotation subset based on labels

        Parameters
        ----------
        labels : set
            Set of labels
        invert : bool, optional
            If invert is True, extract all but requested `labels`

        Returns
        -------
        subset : `Annotation`
            Annotation subset.
        """

        if not isinstance(labels, set):
            raise TypeError('labels must be provided as a set of labels.')

        if invert:
            labels = set(self.labels()) - labels
        else:
            labels = labels & set(self.labels())

        sub = self.__class__(uri=self.uri, modality=self.modality)
        for segment, track, label in self.itertracks(label=True):
            if label in labels:
                sub[segment, track] = label

        return sub

    def label_timeline(self, label):
        """Get timeline for a given label

        Parameters
        ----------
        label :

        Returns
        -------
        timeline : :class:`Timeline`
            Timeline made of all segments annotated with `label`

        """
        if label not in self.labels():
            return Timeline(uri=self.uri)

        if self._labelNeedsUpdate[label]:
            self._updateLabels()

            for l, hasChanged in self._labelNeedsUpdate.iteritems():
                if hasChanged:
                    self._labels[l] = Timeline(uri=self.uri)

            for segment, track, l in self.itertracks(label=True):
                if self._labelNeedsUpdate[l]:
                    self._labels[l].add(segment)

            self._labelNeedsUpdate = {l: False for l in self._labels}

        return self._labels[label]

    def label_coverage(self, label):
        """

        Parameters
        ----------
        label :

        Returns
        -------

        """
        if label not in self.labels():
            return Timeline(uri=self.uri)

        return self.label_timeline(label).coverage()

    def label_duration(self, label):

        if label not in self.labels():
            return 0.

        return self.label_timeline(label).duration()

    def chart(self, percent=False):
        """
        Label chart based on their duration

        Parameters
        ----------
        percent : bool, optional
            Return total duration percentage (rather than raw duration)

        Returns
        -------
        chart : (label, duration) iterable
            Sorted from longest to shortest.

        """

        chart = sorted([(label, self.label_duration(label))
                        for label in self.labels()],
                       key=lambda x: x[1], reverse=True)

        if percent:
            total = np.sum([duration for _, duration in chart])
            chart = [(label, duration/total) for (label, duration) in chart]

        return chart

    def argmax(self, segment=None, known_first=False):
        """Get most frequent label


        Parameters
        ----------
        segment : Segment, optional
            Section of annotation where to look for the most frequent label.
            Defaults to whole annotation extent.
        known_first: bool, optional
            If True, artificially reduces the duration of intersection of
            `Unknown` labels so that 'known' labels are returned first.

        Returns
        -------
        label : any existing label or None
            Label with longest intersection

        Examples
        --------

            >>> annotation = Annotation(modality='speaker')
            >>> annotation[Segment(0, 10), 'speaker1'] = 'Alice'
            >>> annotation[Segment(8, 20), 'speaker1'] = 'Bob'
            >>> print "%s is such a talker!" % annotation.argmax()
            Bob is such a talker!
            >>> segment = Segment(22, 23)
            >>> if not annotation.argmax(segment):
            ...    print "No label intersecting %s" % segment
            No label intersection [22 --> 23]

        """

        # if annotation is empty, obviously there is no most frequent label
        if not self:
            return None

        # if segment is not provided, just look for the overall most frequent
        # label (ie. set segment to the extent of the annotation)
        if segment is None:
            segment = self.get_timeline().extent()

        # compute intersection duration for each label
        durations = {lbl: self.label_timeline(lbl).crop(segment, mode='intersection').duration()
                     for lbl in self.labels()}

        # artifically reduce intersection duration of Unknown labels
        # so that 'known' labels are returned first
        if known_first:
            maxduration = max(durations.values())
            for lbl in durations.keys():
                if isinstance(lbl, Unknown):
                    durations[lbl] = durations[lbl] - maxduration

        # find the most frequent label
        label = max(durations.iteritems(), key=operator.itemgetter(1))[0]

        # in case all durations were zero, there is no most frequent label
        return label if durations[label] > 0 else None

    def __rshift__(self, timeline):
        """Tag a timeline

        Use expression 'tagged = annotation >> timeline'

        Shortcut for :
            >>> tagger = DirectTagger()
            >>> tagged = tagger(annotation, timeline)

        Parameters
        ----------
        timeline : :class:`pyannote.base.timeline.Timeline`

        Returns
        -------
        tagged : :class:`pyannote.base.annotation.Annotation`
            Tagged timeline - one track per intersecting label.

        """
        from pyannote.algorithm.tagging import DirectTagger
        if not isinstance(timeline, Timeline):
            raise TypeError('direct tagging (>>) only works with timelines.')
        return DirectTagger()(self, timeline)

    def translate(self, translation):
        """Translate labels

        Parameters
        ----------
        translation: dict
            Label translation.
            Labels with no associated translation are kept unchanged.

        Returns
        -------
        translated : :class:`Annotation`
            New annotation with translated labels.
        """

        assert isinstance(translation, dict)

        # create an empty copy
        translated = self.empty()

        for segment, track, label in self.itertracks(label=True):
            # only transform labels that have an actual translation
            # in the provided dictionary, keep the others as they are.
            translated[segment, track] = translation.get(label, label)

        return translated

    def __mod__(self, translation):
        return self.translate(translation)

    def anonymize_labels(self):
        """Anonmyize labels

        Create a new annotation where labels are anonymized, ie. each label
        is replaced by a unique `Unknown` instance.

        Returns
        -------
        anonymized : :class:`Annotation`
            New annotation with anonymized labels.

        """
        translation = {label: Unknown() for label in self.labels()}
        return self % translation

    def anonymize_tracks(self):
        """
        Anonymize tracks

        Create a new annotation where each track is anonymized, i.e. the label
        of each track is set to a unique `Unknown` instance

        Returns
        -------
        anonymized : `Annotation`
            Anonymized annotation

        """
        anonymized = self.empty()
        for s, t, _ in self.itertracks(label=True):
            anonymized[s, t] = Unknown()
        return anonymized

    def smooth(self):
        """Smooth annotation

        Create new annotation where contiguous tracks with same label are
        merged into one longer track.

        Returns
        -------
        annotation : Annotation
            New annotation where contiguous tracks with same label are merged
            into one long track.

        Remarks
        -------
            Track names are lost in the process.

        """

        smoothed = self.empty()

        n = 0
        for label in self.labels():
            coverage = self.label_coverage(label)
            for segment in coverage:
                smoothed[segment, n] = label
                n = n+1

        return smoothed

    def co_iter(self, other):
        """
        Parameters
        ----------
        other : Annotation

        Generates
        ---------
        (segment, track), (other_segment, other_track)
        """

        for s, S in self.get_timeline().co_iter(other.get_timeline()):
            tracks = self.get_tracks(s)
            other_tracks = other.get_tracks(S)
            for t, T in itertools.product(tracks, other_tracks):
                yield (s, t), (S, T)

    def for_json(self):
        data = {
            PYANNOTE_JSON_ANNOTATION: [
                [s.for_json(), t, l] for s, t, l in self.itertracks(label=True)
            ],
        }
        if self.uri:
            data[PYANNOTE_URI] = self.uri
        if self.modality:
            data[PYANNOTE_MODALITY] = self.modality
        return data

    @classmethod
    def from_json(cls, data):
        uri = data.get(PYANNOTE_URI, None)
        modality = data.get(PYANNOTE_MODALITY, None)
        annotation = cls(uri=uri, modality=modality)
        for s, track, label in data[PYANNOTE_JSON_ANNOTATION]:
            segment = Segment.from_json(s)
            annotation[segment, track] = label
        return annotation

    def _repr_png_(self):
        from pyannote.core.notebook import repr_annotation
        return repr_annotation(self)
Пример #5
0
class Annotation(object):
    """Annotation

    Parameters
    ----------
    uri : string, optional
        uniform resource identifier of annotated document
    modality : string, optional
        name of annotated modality
    """

    @classmethod
    def from_df(cls, df, uri=None, modality=None):
        """

        Parameters
        ----------
        df : DataFrame
            Must contain the following columns: 'segment', 'track' and 'label'
        uri : str, optional
            Resource identifier
        modality : str, optional
            Modality

        Returns
        -------

        """

        df = df[[PYANNOTE_SEGMENT, PYANNOTE_TRACK, PYANNOTE_LABEL]]

        annotation = cls(uri=uri, modality=modality)

        annotation._tracks = SortedDict(key_type=(float, float), updator=TimelineUpdator)

        for row in df.itertuples():
            if row[1] in annotation._tracks:
                annotation._tracks[row[1]][row[2]] = row[3]
            else:
                annotation._tracks[row[1]] = {row[2]: row[3]}

        annotation._labels = {label: None for label in df["label"].unique()}
        annotation._labelNeedsUpdate = {label: True for label in annotation._labels}

        annotation._timeline = None
        annotation._timelineNeedsUpdate = True

        return annotation

    def __init__(self, uri=None, modality=None):

        super(Annotation, self).__init__()

        self._uri = uri
        self.modality = modality

        # sorted dictionary
        # keys: annotated segments
        # values: {track: label} dictionary
        self._tracks = SortedDict(key_type=(float, float), updator=TimelineUpdator)

        # dictionary
        # key: label
        # value: timeline
        self._labels = {}
        self._labelNeedsUpdate = {}

        # timeline meant to store all annotated segments
        self._timeline = None
        self._timelineNeedsUpdate = True

    def _get_uri(self):
        return self._uri

    def _set_uri(self, uri):
        # update uri for all internal timelines
        for _, timeline in six.iteritems(self._labels):
            timeline.uri = uri
        self._uri = uri

    uri = property(_get_uri, fset=_set_uri, doc="Resource identifier")

    def _updateLabels(self):

        # list of labels that needs to be updated
        update = set(label for label, update in self._labelNeedsUpdate.items() if update)

        # accumulate segments for updated labels
        _segments = {label: [] for label in update}
        for segment, track, label in self.itertracks(label=True):
            if label in update:
                _segments[label].append(segment)

        # create timeline with accumulated segments for updated labels
        for label in update:
            if _segments[label]:
                self._labels[label] = Timeline(segments=_segments[label], uri=self.uri)
                self._labelNeedsUpdate[label] = False
            else:
                self._labels.pop(label)
                self._labelNeedsUpdate.pop(label)

    def __len__(self):
        """Number of segments"""
        return self._tracks.length()

    def __bool__(self):
        return self._tracks.length() > 0

    def __nonzero__(self):
        return self.__bool__()

    def itersegments(self):
        """Segment iterator"""
        return iter(self._tracks)

    def itertracks(self, label=False):
        for segment, tracks in self._tracks.items():
            for track, lbl in sorted(six.iteritems(tracks), key=lambda tl: (str(tl[0]), str(tl[1]))):
                if label:
                    yield segment, track, lbl
                else:
                    yield segment, track

    def _updateTimeline(self):
        self._timeline = Timeline(segments=self._tracks, uri=self.uri)
        self._timelineNeedsUpdate = False

    def get_timeline(self, copy=True):
        """Get timeline made of annotated segments"""
        if self._timelineNeedsUpdate:
            self._updateTimeline()
        if copy:
            return self._timeline.copy()
        return self._timeline

    def __eq__(self, other):
        pairOfTracks = six.moves.zip_longest(self.itertracks(label=True), other.itertracks(label=True))
        return all(t1 == t2 for t1, t2 in pairOfTracks)

    def __ne__(self, other):
        pairOfTracks = six.moves.zip_longest(self.itertracks(label=True), other.itertracks(label=True))

        return any(t1 != t2 for t1, t2 in pairOfTracks)

    def __contains__(self, included):
        """Inclusion

        Use expression 'segment in annotation' or 'timeline in annotation'

        Parameters
        ----------
        included : `Segment` or `Timeline`

        Returns
        -------
        contains : bool
            True if every segment in `included` exists in annotation
            False otherwise

        """
        return included in self.get_timeline(copy=False)

    def crop(self, other, mode="intersection"):
        """Crop annotation

        Parameters
        ----------
        other : `Segment` or `Timeline`

        mode : {'strict', 'loose', 'intersection'}
            In 'strict' mode, only segments fully included in focus coverage
            are kept. In 'loose' mode, any intersecting segment is kept
            unchanged. In 'intersection' mode, only intersecting segments are
            kept and replaced by their actual intersection with the focus.

        Returns
        -------
        cropped : Annotation

        Remarks
        -------
        In 'intersection' mode, the best is done to keep the track names
        unchanged. However, in some cases where two original segments are
        cropped into the same resulting segments, conflicting track names are
        modified to make sure no track is lost.
        """

        if isinstance(other, Segment):
            other = Timeline(segments=[other], uri=self.uri)
            cropped = self.crop(other, mode=mode)

        elif isinstance(other, Timeline):

            cropped = self.__class__(uri=self.uri, modality=self.modality)

            if mode == "loose":
                # TODO
                # update co_iter to yield (segment, tracks), (segment, tracks)
                # instead of segment, segment
                # This would avoid calling ._tracks.get(segment)
                for segment, _ in self.get_timeline(copy=False).co_iter(other):
                    for track, label in six.iteritems(self._tracks[segment]):
                        cropped[segment, track] = label

            elif mode == "strict":
                # TODO
                # see above
                for segment, other_segment in self.get_timeline(copy=False).co_iter(other):

                    if segment in other_segment:
                        for track, label in six.iteritems(self._tracks[segment]):
                            cropped[segment, track] = label

            elif mode == "intersection":
                # see above
                for segment, other_segment in self.get_timeline(copy=False).co_iter(other):

                    intersection = segment & other_segment
                    for track, label in six.iteritems(self._tracks[segment]):
                        track = cropped.new_track(intersection, candidate=track)
                        cropped[intersection, track] = label

            else:
                raise NotImplementedError("unsupported mode: '%s'" % mode)

        return cropped

    def get_tracks(self, segment):
        """Set of tracks for query segment

        Parameters
        ----------
        segment : `Segment`
            Query segment

        Returns
        -------
        tracks : set
            Set of tracks for query segment
        """
        return set(self._tracks.get(segment, {}))

    def has_track(self, segment, track):
        """Check whether a given track exists

        Parameters
        ----------
        segment : `Segment`
            Query segment
        track :
            Query track

        Returns
        -------
        exists : bool
            True if track exists for segment
        """
        return track in self._tracks.get(segment, {})

    def get_track_by_name(self, track):
        """Get all tracks with given name

        Parameters
        ----------
        track : any valid track name
            Requested name track

        Returns
        -------
        tracks : list
            List of (segment, track) tuples
        """
        raise NotImplementedError("")

    def copy(self):

        # create new empty annotation
        copied = self.__class__(uri=self.uri, modality=self.modality)

        # deep copy internal track dictionary
        _tracks = [(key, dict(value)) for (key, value) in self._tracks.items()]
        copied._tracks = SortedDict(items=_tracks, key_type=(float, float), updator=TimelineUpdator)

        # deep copy internal label timelines
        _labels = {key: timeline.copy() for (key, timeline) in six.iteritems(self._labels)}
        copied._labels = _labels

        # deep copy need-update indicator
        copied._labelNeedsUpdate = dict(self._labelNeedsUpdate)

        copied._timelineNeedsUpdate = True

        return copied

    def retrack(self):
        """
        """
        retracked = self.__class__(uri=self.uri, modality=self.modality)
        for n, (s, _, label) in enumerate(self.itertracks(label=True)):
            retracked[s, n] = label
        return retracked

    def new_track(self, segment, candidate=None, prefix=None):
        """Track name generator

        Parameters
        ----------
        segment : Segment
        prefix : str, optional
        candidate : any valid track name

        Returns
        -------
        track : str
            New track name
        """

        # obtain list of existing tracks for segment
        existing_tracks = set(self._tracks.get(segment, {}))

        # if candidate is provided, check whether it already exists
        # in case it does not, use it
        if (candidate is not None) and (candidate not in existing_tracks):
            return candidate

        # no candidate was provided or the provided candidate already exists
        # we need to create a brand new one

        # by default (if prefix is not provided), use ''
        if prefix is None:
            prefix = ""

        # find first non-existing track name for segment
        # eg. if '0' exists, try '1', then '2', ...
        count = 0
        while ("%s%d" % (prefix, count)) in existing_tracks:
            count += 1

        # return first non-existing track name
        return "%s%d" % (prefix, count)

    def __str__(self):
        """Human-friendly representation"""
        # TODO: use pandas.DataFrame
        return "\n".join(["%s %s %s" % (s, t, l) for s, t, l in self.itertracks(label=True)])

    def __delitem__(self, key):

        # del annotation[segment]
        if isinstance(key, Segment):

            # Pop segment out of dictionary
            # and get corresponding tracks
            # Raises KeyError if segment does not exist
            tracks = self._tracks.pop(key)

            # mark timeline as modified
            self._timelineNeedsUpdate = True

            # mark every label in tracks as modified
            for track, label in six.iteritems(tracks):
                self._labelNeedsUpdate[label] = True

        # del annotation[segment, track]
        elif isinstance(key, tuple) and len(key) == 2:

            # get segment tracks as dictionary
            # if segment does not exist, get empty dictionary
            # Raises KeyError if segment does not exist
            tracks = self._tracks[key[0]]

            # pop track out of tracks dictionary
            # and get corresponding label
            # Raises KeyError if track does not exist
            label = tracks.pop(key[1])

            # mark label as modified
            self._labelNeedsUpdate[label] = True

            # if tracks dictionary is now empty,
            # remove segment as well
            if not tracks:
                self._tracks.pop(key[0])
                self._timelineNeedsUpdate = True

        else:
            raise KeyError("")

    # label = annotation[segment, track]
    def __getitem__(self, key):

        if isinstance(key, Segment):
            key = (key, "_")

        return self._tracks[key[0]][key[1]]

    # annotation[segment, track] = label
    def __setitem__(self, key, label):

        if isinstance(key, Segment):
            key = (key, "_")

        segment, track = key

        # do not add empty track
        if not segment:
            return

        # in case we create a new segment
        # mark timeline as modified
        if segment not in self._tracks:
            self._tracks[segment] = {}
            self._timelineNeedsUpdate = True

        # in case we modify an existing track
        # mark old label as modified
        if track in self._tracks[segment]:
            old_label = self._tracks[segment][track]
            self._labelNeedsUpdate[old_label] = True

        # mark new label as modified
        self._tracks[segment][track] = label
        self._labelNeedsUpdate[label] = True

    def empty(self):
        return self.__class__(uri=self.uri, modality=self.modality)

    def labels(self):
        """List of labels

        Returns
        -------
        labels : list
            Sorted list of labels
        """
        if any([lnu for lnu in self._labelNeedsUpdate.values()]):
            self._updateLabels()
        return sorted(self._labels, key=str)

    def get_labels(self, segment, unique=True):
        """Local set of labels

        Parameters
        ----------
        segment : Segment
            Segments to get label from.
        unique : bool, optional
            When False, return the list of (possibly repeated) labels.
            When True (default), return the set of labels
        Returns
        -------
        labels : set
            Set of labels for `segment` if it exists, empty set otherwise.

        Examples
        --------
        >>> annotation = Annotation()
        >>> segment = Segment(0, 2)
        >>> annotation[segment, 'speaker1'] = 'Bernard'
        >>> annotation[segment, 'speaker2'] = 'John'
        >>> print sorted(annotation.get_labels(segment))
        set(['Bernard', 'John'])
        >>> print annotation.get_labels(Segment(1, 2))
        set([])

        """

        labels = self._tracks.get(segment, {}).values()

        if unique:
            return set(labels)

        return labels

    def subset(self, labels, invert=False):
        """Annotation subset

        Extract annotation subset based on labels

        Parameters
        ----------
        labels : iterable
            Label iterable.
        invert : bool, optional
            If invert is True, extract all but requested `labels`

        Returns
        -------
        subset : `Annotation`
            Annotation subset.
        """

        labels = set(labels)

        if invert:
            labels = set(self.labels()) - labels
        else:
            labels = labels & set(self.labels())

        sub = self.__class__(uri=self.uri, modality=self.modality)
        for segment, track, label in self.itertracks(label=True):
            if label in labels:
                sub[segment, track] = label

        return sub

    def update(self, annotation, copy=False):
        """Update existing annotations or create new ones

        Parameters
        ----------
        annotation : Annotation
            Updated (or new) annotations
        copy : bool, optional
            Create a copy before updating. Defaults to False.

        Returns
        -------
        updated : Annotation
            Updated annotations.
        """

        result = self.copy() if copy else self
        for segment, track, label in annotation.itertracks(label=True):
            result[segment, track] = label
        return result

    def label_timeline(self, label, copy=True):
        """Get timeline for a given label

        Parameters
        ----------
        label :
        copy : bool, optional
            Defaults to True.

        Returns
        -------
        timeline : :class:`Timeline`
            Timeline made of all segments annotated with `label`

        """
        if label not in self.labels():
            return Timeline(uri=self.uri)

        if self._labelNeedsUpdate[label]:
            self._updateLabels()

        if copy:
            return self._labels[label].copy()

        return self._labels[label]

    def label_coverage(self, label):
        """Return label coverage (or support)

        Parameters
        ----------
        label : any valid label

        Returns
        -------
        coverage : Timeline

        """
        return self.label_timeline(label, copy=False).coverage()

    def label_duration(self, label):
        return self.label_timeline(label, copy=False).duration()

    def chart(self, percent=False):
        """
        Label chart based on their duration

        Parameters
        ----------
        percent : bool, optional
            Return total duration percentage (rather than raw duration)

        Returns
        -------
        chart : (label, duration) iterable
            Sorted from longest to shortest.

        """

        chart = sorted(
            [(label, self.label_duration(label)) for label in self.labels()], key=lambda x: x[1], reverse=True
        )

        if percent:
            total = np.sum([duration for _, duration in chart])
            chart = [(label, duration / total) for (label, duration) in chart]

        return chart

    def argmax(self, segment=None):
        """Get most frequent label

        Parameters
        ----------
        segment : Segment, optional
            Section of annotation where to look for the most frequent label.
            Defaults to whole annotation extent.

        Returns
        -------
        label : any existing label or None
            Label with longest intersection

        Examples
        --------

            >>> annotation = Annotation(modality='speaker')
            >>> annotation[Segment(0, 10), 'speaker1'] = 'Alice'
            >>> annotation[Segment(8, 20), 'speaker1'] = 'Bob'
            >>> print "%s is such a talker!" % annotation.argmax()
            Bob is such a talker!
            >>> segment = Segment(22, 23)
            >>> if not annotation.argmax(segment):
            ...    print "No label intersecting %s" % segment
            No label intersection [22 --> 23]

        """

        # if annotation is empty, obviously there is no most frequent label
        if not self:
            return None

        # if segment is not provided, just look for the overall most frequent
        # label (ie. set segment to the extent of the annotation)
        if segment is None:
            segment = self.get_timeline(copy=False).extent()

        # compute intersection duration for each label
        durations = {
            lbl: self.label_timeline(lbl, copy=False).crop(segment, mode="intersection").duration()
            for lbl in self.labels()
        }

        # find the most frequent label
        label = max(six.iteritems(durations), key=operator.itemgetter(1))[0]

        # in case all durations were zero, there is no most frequent label
        return label if durations[label] > 0 else None

    def translate(self, translation):
        """Translate labels

        Parameters
        ----------
        translation: dict
            Label translation.
            Labels with no associated translation are kept unchanged.

        Returns
        -------
        translated : :class:`Annotation`
            New annotation with translated labels.
        """

        assert isinstance(translation, dict)

        # create an empty copy
        translated = self.empty()

        for segment, track, label in self.itertracks(label=True):
            # only transform labels that have an actual translation
            # in the provided dictionary, keep the others as they are.
            translated[segment, track] = translation.get(label, label)

        return translated

    def __mod__(self, translation):
        return self.translate(translation)

    def anonymize_labels(self, generator="string"):
        """Anonmyize labels

        Create a new annotation where labels are anonymized.

        Parameters
        ----------
        generator : {'string', 'int', iterator}, optional

        Returns
        -------
        anonymized : :class:`Annotation`
            New annotation with anonymized labels.

        """

        if generator == "string":
            generator = string_generator()
        elif generator == "int":
            generator = int_generator()

        mapping = {label: next(generator) for label in self.labels()}
        return self.translate(mapping)

    def anonymize_tracks(self, generator="string"):
        """
        Anonymize tracks

        Create a new annotation where each track has a unique label.

        Parameters
        ----------
        generator : {'string', 'int', iterator}, optional
            Default to 'string'.

        Returns
        -------
        anonymized : `Annotation`
            New annotation with anonymized tracks.

        """

        if generator == "string":
            generator = string_generator()
        elif generator == "int":
            generator = int_generator()

        anonymized = self.empty()
        for s, t, _ in self.itertracks(label=True):
            anonymized[s, t] = next(generator)

        return anonymized

    def smooth(self, collar=0.0):
        """Smooth annotation

        Create new annotation where contiguous tracks with same label are
        merged into one longer track.

        Parameters
        ----------
        collar : float
            If collar is positive, also merge tracks separated by less than
            collar duration.

        Returns
        -------
        annotation : Annotation
            New annotation where contiguous tracks with same label are merged
            into one long track.

        Remarks
        -------
            Track names are lost in the process.
        """

        # initialize an empty annotation
        # with same uri and modality as original
        smoothed = self.empty()
        for label in self.labels():

            # get timeline for current label
            timeline = self.label_timeline(label, copy=True)

            # fill the gaps shorter than collar
            if collar > 0.0:
                gaps = timeline.gaps()
                for gap in gaps:
                    if gap.duration < collar:
                        timeline.add(gap)

            # reconstruct annotation with merged tracks
            for segment in timeline.coverage():
                track = smoothed.new_track(segment)
                smoothed[segment, track] = label

        # return
        return smoothed

    def co_iter(self, other):
        """
        Parameters
        ----------
        other : Annotation

        Generates
        ---------
        (segment, track), (other_segment, other_track)
        """
        timeline = self.get_timeline(copy=False)
        other_timeline = other.get_timeline(copy=False)
        for s, S in timeline.co_iter(other_timeline):
            tracks = sorted(self.get_tracks(s), key=str)
            other_tracks = sorted(other.get_tracks(S), key=str)
            for t, T in itertools.product(tracks, other_tracks):
                yield (s, t), (S, T)

    def __mul__(self, other):
        """Compute cooccurrence matrix"""

        i = self.labels()
        j = other.labels()

        matrix = DataArray(np.zeros((len(i), len(j))), coords=[("i", i), ("j", j)])

        for (segment, track), (other_segment, other_track) in self.co_iter(other):
            label = self[segment, track]
            other_label = other[other_segment, other_track]
            duration = (segment & other_segment).duration
            matrix.loc[label, other_label] += duration

        return matrix

    def for_json(self):

        data = {PYANNOTE_JSON: self.__class__.__name__}
        content = [
            {PYANNOTE_SEGMENT: s.for_json(), PYANNOTE_TRACK: t, PYANNOTE_LABEL: l}
            for s, t, l in self.itertracks(label=True)
        ]
        data[PYANNOTE_JSON_CONTENT] = content

        if self.uri:
            data[PYANNOTE_URI] = self.uri

        if self.modality:
            data[PYANNOTE_MODALITY] = self.modality

        return data

    @classmethod
    def from_json(cls, data):

        uri = data.get(PYANNOTE_URI, None)
        modality = data.get(PYANNOTE_MODALITY, None)
        annotation = cls(uri=uri, modality=modality)
        for one in data[PYANNOTE_JSON_CONTENT]:
            segment = Segment.from_json(one[PYANNOTE_SEGMENT])
            track = one[PYANNOTE_TRACK]
            label = one[PYANNOTE_LABEL]
            annotation[segment, track] = label

        return annotation

    def _repr_png_(self):
        from .notebook import repr_annotation

        return repr_annotation(self)
Пример #6
0
class Annotation(object):
    """Annotation

    Parameters
    ----------
    uri : string, optional
        uniform resource identifier of annotated document
    modality : string, optional
        name of annotated modality

    """

    @classmethod
    def from_df(cls, df, uri=None, modality=None):
        """

        Parameters
        ----------
        df : DataFrame
            Must contain the following columns: 'segment', 'track' and 'label'
        uri : str, optional
            Resource identifier
        modality : str, optional
            Modality

        Returns
        -------

        """
        annotation = cls(uri=uri, modality=modality)
        for _, (segment, track, label) in df[
            [PYANNOTE_SEGMENT,
             PYANNOTE_TRACK,
             PYANNOTE_LABEL]
        ].iterrows():
            annotation[segment, track] = label

        return annotation

    def __init__(self, uri=None, modality=None):
        super(Annotation, self).__init__()

        self._uri = uri
        self.modality = modality

        # sorted dictionary
        # keys: annotated segments
        # values: {track: label} dictionary
        self._tracks = SortedDict(key_type=(float, float),
                                  updator=TimelineUpdator)

        # dictionary
        # key: label
        # value: timeline
        self._labels = {}
        self._labelNeedsUpdate = {}

        # timeline meant to store all annotated segments
        self._timeline = Timeline(uri=uri)
        self._timelineNeedsUpdate = True

    def _get_uri(self):
        return self._uri

    def _set_uri(self, uri):
        # update uri for all internal timelines
        for _, timeline in self._labels.iteritems():
            timeline.uri = uri
        self._uri = uri

    uri = property(_get_uri, fset=_set_uri, doc="Resource identifier")

    def _updateLabels(self):

        # (re-)initialize changed label timeline
        for l, needsUpdate in self._labelNeedsUpdate.iteritems():
            if needsUpdate:
                self._labels[l] = Timeline(uri=self.uri)

        # fill changed label timeline
        for segment, track, l in self.itertracks(label=True):
            if self._labelNeedsUpdate[l]:
                self._labels[l].add(segment)

        self._labelNeedsUpdate = {l: False for l in self._labels}

        # remove "ghost" labels (i.e. label with empty timeline)
        labels = self._labels.keys()
        for l in labels:
            if not self._labels[l]:
                self._labels.pop(l)
                self._labelNeedsUpdate.pop(l)

    def __len__(self):
        """Number of segments"""
        return self._tracks.length()

    def __nonzero__(self):
        return self._tracks.length() > 0

    def itersegments(self):
        """Segment iterator"""
        return iter(self._tracks)

    def itertracks(self, label=False):
        for segment, tracks in self._tracks.items():
            for track, lbl in tracks.iteritems():
                if label:
                    yield segment, track, lbl
                else:
                    yield segment, track

    def _updateTimeline(self):
        self._timeline = Timeline(segments=self._tracks, uri=self.uri)
        self._timelineNeedsUpdate = False

    def get_timeline(self):
        """Get timeline made of annotated segments"""
        if self._timelineNeedsUpdate:
            self._updateTimeline()
        return self._timeline

    def __eq__(self, other):
        return self._tracks == other._tracks

    def __ne__(self, other):
        return self._tracks != other._tracks

    def __contains__(self, included):
        """Inclusion

        Use expression 'segment in annotation' or 'timeline in annotation'

        Parameters
        ----------
        included : `Segment` or `Timeline`

        Returns
        -------
        contains : bool
            True if every segment in `included` exists in annotation
            False otherwise

        """
        return included in self.get_timeline()

    def crop(self, other, mode='intersection'):
        """Crop annotation

        Parameters
        ----------
        other : `Segment` or `Timeline`

        mode : {'strict', 'loose', 'intersection'}
            In 'strict' mode, only segments fully included in focus coverage
            are kept. In 'loose' mode, any intersecting segment is kept
            unchanged. In 'intersection' mode, only intersecting segments are
            kept and replaced by their actual intersection with the focus.

        Returns
        -------
        cropped : Annotation

        Remarks
        -------
        In 'intersection' mode, the best is done to keep the track names
        unchanged. However, in some cases where two original segments are
        cropped into the same resulting segments, conflicting track names are
        modified to make sure no track is lost.
        """

        if isinstance(other, Segment):
            other = Timeline(segments=[other], uri=self.uri)
            cropped = self.crop(other, mode=mode)

        elif isinstance(other, Timeline):

            cropped = self.__class__(uri=self.uri, modality=self.modality)

            if mode == 'loose':
                # TODO
                # update co_iter to yield (segment, tracks), (segment, tracks)
                # instead of segment, segment
                # This would avoid calling ._tracks.get(segment)
                for segment, _ in self.get_timeline().co_iter(other):
                    for track, label in self._tracks[segment].iteritems():
                        cropped[segment, track] = label

            elif mode == 'strict':
                # TODO
                # see above
                for segment, other_segment in self.get_timeline().co_iter(other):
                    if segment in other_segment:
                        for track, label in self._tracks[segment].iteritems():
                            cropped[segment, track] = label

            elif mode == 'intersection':
                # see above
                for segment, other_segment in self.get_timeline().co_iter(other):
                    intersection = segment & other_segment
                    for track, label in self._tracks[segment].iteritems():
                        track = cropped.new_track(intersection,
                                                  candidate=track)
                        cropped[intersection, track] = label

            else:
                raise NotImplementedError("unsupported mode: '%s'" % mode)

        return cropped

    def get_tracks(self, segment):
        """Set of tracks for query segment

        Parameters
        ----------
        segment : `Segment`
            Query segment

        Returns
        -------
        tracks : set
            Set of tracks for query segment
        """
        return set(self._tracks.get(segment, {}))

    def has_track(self, segment, track):
        """Check whether a given track exists

        Parameters
        ----------
        segment : `Segment`
            Query segment
        track :
            Query track

        Returns
        -------
        exists : bool
            True if track exists for segment
        """
        return track in self._tracks.get(segment, {})

    def get_track_by_name(self, track):
        """Get all tracks with given name

        Parameters
        ----------
        track : any valid track name
            Requested name track

        Returns
        -------
        tracks : list
            List of (segment, track) tuples
        """
        raise NotImplementedError('')

    def copy(self):

        # create new empty annotation
        copied = self.__class__(uri=self.uri, modality=self.modality)

        # deep copy internal track dictionary
        _tracks = [(key, dict(value)) for (key, value) in self._tracks.items()]
        copied._tracks = SortedDict(items=_tracks,
                                    key_type=(float, float),
                                    updator=TimelineUpdator)

        # deep copy internal label timelines
        _labels = {key: timeline.copy()
                   for (key, timeline) in self._labels.iteritems()}
        copied._labels = _labels

        # deep copy need-update indicator
        copied._labelNeedsUpdate = dict(self._labelNeedsUpdate)

        copied._timelineNeedsUpdate = self._timelineNeedsUpdate

        return copied

    def retrack(self):
        """
        """
        retracked = self.__class__(uri=self.uri, modality=self.modality)
        for n, (s, _, label) in enumerate(self.itertracks(label=True)):
            retracked[s, n] = label
        return retracked

    def new_track(self, segment, candidate=None, prefix=None):
        """Track name generator

        Parameters
        ----------
        segment : Segment
        prefix : str, optional
        candidate : any valid track name

        Returns
        -------
        track : str
            New track name
        """

        # obtain list of existing tracks for segment
        existing_tracks = set(self._tracks.get(segment, {}))

        # if candidate is provided, check whether it already exists
        # in case it does not, use it
        if (candidate is not None) and (candidate not in existing_tracks):
            return candidate

        # no candidate was provided or the provided candidate already exists
        # we need to create a brand new one

        # by default (if prefix is not provided), use ''
        if prefix is None:
            prefix = ''

        # find first non-existing track name for segment
        # eg. if '0' exists, try '1', then '2', ...
        count = 0
        while ('%s%d' % (prefix, count)) in existing_tracks:
            count += 1

        # return first non-existing track name
        return '%s%d' % (prefix, count)

    def __str__(self):
        """Human-friendly representation"""
        # TODO: use pandas.DataFrame
        return "\n".join(["%s %s %s" % (s, t, l)
                          for s, t, l in self.itertracks(label=True)])

    def __delitem__(self, key):

        # del annotation[segment]
        if isinstance(key, Segment):

            # Pop segment out of dictionary
            # and get corresponding tracks
            # Raises KeyError if segment does not exist
            tracks = self._tracks.pop(key)

            # mark timeline as modified
            self._timelineNeedsUpdate = True

            # mark every label in tracks as modified
            for track, label in tracks.iteritems():
                self._labelNeedsUpdate[label] = True

        # del annotation[segment, track]
        elif isinstance(key, tuple) and len(key) == 2:

            # get segment tracks as dictionary
            # if segment does not exist, get empty dictionary
            # Raises KeyError if segment does not exist
            tracks = self._tracks[key[0]]

            # pop track out of tracks dictionary
            # and get corresponding label
            # Raises KeyError if track does not exist
            label = tracks.pop(key[1])

            # mark label as modified
            self._labelNeedsUpdate[label] = True

            # if tracks dictionary is now empty,
            # remove segment as well
            if not tracks:
                self._tracks.pop(key[0])
                self._timelineNeedsUpdate = True

        else:
            raise KeyError('')

    # label = annotation[segment, track]
    def __getitem__(self, key):

        if isinstance(key, Segment):
            key = (key, '_')

        return self._tracks[key[0]][key[1]]

    # annotation[segment, track] = label
    def __setitem__(self, key, label):

        if isinstance(key, Segment):
            key = (key, '_')

        if key[0] not in self._tracks:
            self._tracks[key[0]] = {}
            self._timelineNeedsUpdate = True

        self._tracks[key[0]][key[1]] = label
        self._labelNeedsUpdate[label] = True

    def empty(self):
        return self.__class__(uri=self.uri, modality=self.modality)

    @staticmethod
    def _cmp_labels(label1, label2):
        # unknown > not_unknown
        # otherwise, just use regular cmp
        u1 = isinstance(label1, Unknown)
        u2 = isinstance(label2, Unknown)
        if u1 == u2:
            return cmp(label1, label2)
        return u1 - u2

    def labels(self, unknown=True):
        """List of labels

        Parameters
        ----------
        unknown : bool, optional
            When False, do not return Unknown instances
            When True, return any label (even Unknown instances)

        Returns
        -------
        labels : list
            Sorted list of labels
        """

        if any([lnu for lnu in self._labelNeedsUpdate.values()]):
            self._updateLabels()

        labels = sorted(self._labels, cmp=self._cmp_labels)

        if not unknown:
            labels = [l for l in labels if not isinstance(l, Unknown)]

        return labels

    def get_labels(self, segment, unknown=True, unique=True):
        """Local set of labels

        Parameters
        ----------
        segment : Segment
            Segments to get label from.
        unknown : bool, optional
            When False, do not return Unknown instances
            When True, return any label (even Unknown instances)
        unique : bool, optional
            When False, return the list of (possibly repeated) labels.
            When True (default), return the set of labels
        Returns
        -------
        labels : set
            Set of labels for `segment` if it exists, empty set otherwise.

        Examples
        --------

            >>> annotation = Annotation()
            >>> segment = Segment(0, 2)
            >>> annotation[segment, 'speaker1'] = 'Bernard'
            >>> annotation[segment, 'speaker2'] = 'John'
            >>> print sorted(annotation.get_labels(segment))
            set(['Bernard', 'John'])
            >>> print annotation.get_labels(Segment(1, 2))
            set([])

        """

        labels = self._tracks.get(segment, {}).values()

        if not unknown:
            labels = [l for l in labels if not isinstance(l, Unknown)]

        if unique:
            labels = set(labels)

        return labels

    def subset(self, labels, invert=False):
        """Annotation subset

        Extract annotation subset based on labels

        Parameters
        ----------
        labels : set
            Set of labels
        invert : bool, optional
            If invert is True, extract all but requested `labels`

        Returns
        -------
        subset : `Annotation`
            Annotation subset.
        """

        if not isinstance(labels, set):
            raise TypeError('labels must be provided as a set of labels.')

        if invert:
            labels = set(self.labels()) - labels
        else:
            labels = labels & set(self.labels())

        sub = self.__class__(uri=self.uri, modality=self.modality)
        for segment, track, label in self.itertracks(label=True):
            if label in labels:
                sub[segment, track] = label

        return sub

    def label_timeline(self, label):
        """Get timeline for a given label

        Parameters
        ----------
        label :

        Returns
        -------
        timeline : :class:`Timeline`
            Timeline made of all segments annotated with `label`

        """
        if label not in self.labels():
            return Timeline(uri=self.uri)

        if self._labelNeedsUpdate[label]:
            self._updateLabels()

            for l, hasChanged in self._labelNeedsUpdate.iteritems():
                if hasChanged:
                    self._labels[l] = Timeline(uri=self.uri)

            for segment, track, l in self.itertracks(label=True):
                if self._labelNeedsUpdate[l]:
                    self._labels[l].add(segment)

            self._labelNeedsUpdate = {l: False for l in self._labels}

        return self._labels[label].copy()

    def label_coverage(self, label):
        """

        Parameters
        ----------
        label :

        Returns
        -------

        """
        if label not in self.labels():
            return Timeline(uri=self.uri)

        return self.label_timeline(label).coverage()

    def label_duration(self, label):

        if label not in self.labels():
            return 0.

        return self.label_timeline(label).duration()

    def chart(self, percent=False):
        """
        Label chart based on their duration

        Parameters
        ----------
        percent : bool, optional
            Return total duration percentage (rather than raw duration)

        Returns
        -------
        chart : (label, duration) iterable
            Sorted from longest to shortest.

        """

        chart = sorted([(label, self.label_duration(label))
                        for label in self.labels()],
                       key=lambda x: x[1], reverse=True)

        if percent:
            total = np.sum([duration for _, duration in chart])
            chart = [(label, duration/total) for (label, duration) in chart]

        return chart

    def argmax(self, segment=None, known_first=False):
        """Get most frequent label


        Parameters
        ----------
        segment : Segment, optional
            Section of annotation where to look for the most frequent label.
            Defaults to whole annotation extent.
        known_first: bool, optional
            If True, artificially reduces the duration of intersection of
            `Unknown` labels so that 'known' labels are returned first.

        Returns
        -------
        label : any existing label or None
            Label with longest intersection

        Examples
        --------

            >>> annotation = Annotation(modality='speaker')
            >>> annotation[Segment(0, 10), 'speaker1'] = 'Alice'
            >>> annotation[Segment(8, 20), 'speaker1'] = 'Bob'
            >>> print "%s is such a talker!" % annotation.argmax()
            Bob is such a talker!
            >>> segment = Segment(22, 23)
            >>> if not annotation.argmax(segment):
            ...    print "No label intersecting %s" % segment
            No label intersection [22 --> 23]

        """

        # if annotation is empty, obviously there is no most frequent label
        if not self:
            return None

        # if segment is not provided, just look for the overall most frequent
        # label (ie. set segment to the extent of the annotation)
        if segment is None:
            segment = self.get_timeline().extent()

        # compute intersection duration for each label
        durations = {lbl: self.label_timeline(lbl).crop(segment, mode='intersection').duration()
                     for lbl in self.labels()}

        # artifically reduce intersection duration of Unknown labels
        # so that 'known' labels are returned first
        if known_first:
            maxduration = max(durations.values())
            for lbl in durations.keys():
                if isinstance(lbl, Unknown):
                    durations[lbl] = durations[lbl] - maxduration

        # find the most frequent label
        label = max(durations.iteritems(), key=operator.itemgetter(1))[0]

        # in case all durations were zero, there is no most frequent label
        return label if durations[label] > 0 else None

    def translate(self, translation):
        """Translate labels

        Parameters
        ----------
        translation: dict
            Label translation.
            Labels with no associated translation are kept unchanged.

        Returns
        -------
        translated : :class:`Annotation`
            New annotation with translated labels.
        """

        assert isinstance(translation, dict)

        # create an empty copy
        translated = self.empty()

        for segment, track, label in self.itertracks(label=True):
            # only transform labels that have an actual translation
            # in the provided dictionary, keep the others as they are.
            translated[segment, track] = translation.get(label, label)

        return translated

    def __mod__(self, translation):
        return self.translate(translation)

    def anonymize_labels(self):
        """Anonmyize labels

        Create a new annotation where labels are anonymized, ie. each label
        is replaced by a unique `Unknown` instance.

        Returns
        -------
        anonymized : :class:`Annotation`
            New annotation with anonymized labels.

        """
        translation = {label: Unknown() for label in self.labels()}
        return self % translation

    def anonymize_tracks(self):
        """
        Anonymize tracks

        Create a new annotation where each track is anonymized, i.e. the label
        of each track is set to a unique `Unknown` instance

        Returns
        -------
        anonymized : `Annotation`
            Anonymized annotation

        """
        anonymized = self.empty()
        for s, t, _ in self.itertracks(label=True):
            anonymized[s, t] = Unknown()
        return anonymized

    def smooth(self, collar=0.):
        """Smooth annotation

        Create new annotation where contiguous tracks with same label are
        merged into one longer track.

        Parameters
        ----------
        collar : float
            If collar is positive, also merge tracks separated by less than
            collar duration.

        Returns
        -------
        annotation : Annotation
            New annotation where contiguous tracks with same label are merged
            into one long track.

        Remarks
        -------
            Track names are lost in the process.
        """

        # initialize an empty annotation
        # with same uri and modality as original
        smoothed = self.empty()
        for label in self.labels():

            # get timeline for current label
            timeline = self.label_timeline(label)

            # fill the gaps shorter than collar
            if collar > 0.:
                gaps = timeline.gaps()
                for gap in gaps:
                    if gap.duration < collar:
                        timeline.add(gap)

            # reconstruct annotation with merged tracks
            for segment in timeline.coverage():
                track = smoothed.new_track(segment)
                smoothed[segment, track] = label

        # return
        return smoothed

    def co_iter(self, other):
        """
        Parameters
        ----------
        other : Annotation

        Generates
        ---------
        (segment, track), (other_segment, other_track)
        """

        for s, S in self.get_timeline().co_iter(other.get_timeline()):
            tracks = self.get_tracks(s)
            other_tracks = other.get_tracks(S)
            for t, T in itertools.product(tracks, other_tracks):
                yield (s, t), (S, T)

    def for_json(self):
        data = {
            PYANNOTE_JSON_ANNOTATION: [
                [s.for_json(), t, l] for s, t, l in self.itertracks(label=True)
            ],
        }
        if self.uri:
            data[PYANNOTE_URI] = self.uri
        if self.modality:
            data[PYANNOTE_MODALITY] = self.modality
        return data

    @classmethod
    def from_json(cls, data):
        uri = data.get(PYANNOTE_URI, None)
        modality = data.get(PYANNOTE_MODALITY, None)
        annotation = cls(uri=uri, modality=modality)
        for segment, track, label in data[PYANNOTE_JSON_ANNOTATION]:
            annotation[segment, track] = label
        return annotation

    def _repr_png_(self):
        from pyannote.core.notebook import repr_annotation
        return repr_annotation(self)