Exemplo n.º 1
0
    def scores(self, segmentation, features):
        """Score every track of `segmentation` against each target.

        Parameters
        ----------
        segmentation : pyannote segmentation (provides uri/modality
            and itertracks())
        features : feature container exposing `.data` and
            `.sliding_window`

        Returns
        -------
        scores : Scores
            One score per (segment, track, target) triple.
        """
        # empty container sharing uri/modality with the input segmentation
        scores = Scores(uri=segmentation.uri, modality=segmentation.modality)

        # raw features data
        data = features.data

        # one row of raw scores per target, transposed so that rows are
        # feature frames and columns are targets
        per_target = [
            self._apply_model(self._model[target], data)
            for target in self.targets
        ]
        targets_scores = np.vstack(per_target).T

        # apply background normalization when a background model exists
        if hasattr(self, '_background'):
            targets_scores = self._apply_background(data, targets_scores)

        # TODO: make it work for any kind of features
        new_features = SlidingWindowFeature(
            targets_scores, features.sliding_window)

        # aggregate frame-level scores over each track's extent
        for segment, track in segmentation.itertracks():
            aggregated = self._aggregate_track_scores(
                new_features.crop(segment))
            for index, target in enumerate(self.targets):
                scores[segment, track, target] = aggregated[index]

        return scores
Exemplo n.º 2
0
    def scores(self, segmentation, features):
        """Compute per-target scores for each track in `segmentation`.

        Parameters
        ----------
        segmentation : pyannote segmentation (uri/modality/itertracks())
        features : feature container with `.data` and `.sliding_window`

        Returns
        -------
        scores : Scores
        """
        # result container for (segment, track, target) scores
        scores = Scores(uri=segmentation.uri, modality=segmentation.modality)

        # raw features data
        data = features.data

        # stack one row of model scores per target, then transpose so
        # that rows correspond to frames and columns to targets
        rows = []
        for target in self.targets:
            rows.append(self._apply_model(self._model[target], data))
        targets_scores = np.vstack(rows).T

        # background normalization, when a background model was trained
        if hasattr(self, '_background'):
            targets_scores = self._apply_background(data, targets_scores)

        # TODO: make it work for any kind of features
        new_features = SlidingWindowFeature(targets_scores,
                                            features.sliding_window)

        # reduce frame-level scores to a single vector per track
        for segment, track in segmentation.itertracks():
            track_scores = self._aggregate_track_scores(
                new_features.crop(segment))
            for position, target in enumerate(self.targets):
                scores[segment, track, target] = track_scores[position]

        return scores
Exemplo n.º 3
0
    def __call__(self, item):
        """Extract features

        Parameters
        ----------
        item : dict
            Audio file description, as accepted by `read_audio`.

        Returns
        -------
        features : SlidingWindowFeature

        """
        # load waveform, resampled and mixed down to a single channel
        y, sample_rate = read_audio(
            item, sample_rate=self.sample_rate, mono=True)

        # run the actual feature computation
        data = self.process(y, sample_rate)

        # warn (but do not fail) when the output contains NaNs
        if np.isnan(data).any():
            uri = get_unique_identifier(item)
            msg = 'Features extracted from "{uri}" contain NaNs.'
            warnings.warn(msg.format(uri=uri))

        return SlidingWindowFeature(data.T, self.sliding_window_)
Exemplo n.º 4
0
    def extract(self, path):
        """Extract one feature vector per video frame.

        Parameters
        ----------
        path : str
            Path to video file.

        Returns
        -------
        features : SlidingWindowFeature
            Frames that could not be read keep NaN features.
        """
        capture = cv2.VideoCapture(path)

        try:
            # frame size
            # height = int(capture.get(cv.CV_CAP_PROP_FRAME_HEIGHT))
            # width = int(capture.get(cv.CV_CAP_PROP_FRAME_WIDTH))

            # video "size"
            framePerSecond = capture.get(cv.CV_CAP_PROP_FPS)
            frameCount = int(capture.get(cv.CV_CAP_PROP_FRAME_COUNT))
            # duration = frameCount / framePerSecond

            # NaN-initialized so unread frames are marked as missing
            # (np.nan instead of np.NaN: the latter was removed in NumPy 2.0)
            data = np.nan * np.ones((frameCount, self.get_dimension()))

            while True:

                # current frame index, before it is consumed by read()
                f = int(capture.get(cv.CV_CAP_PROP_POS_FRAMES))

                success, frame = capture.read()
                if not success:
                    break

                data[f, :] = self.process_frame(frame)

        finally:
            # fix: release the capture even on error (it was leaked before)
            capture.release()

        # one feature vector per frame: window duration == step == 1/fps
        duration = step = 1. / framePerSecond
        sliding_window = SlidingWindow(start=0., duration=duration, step=step)
        return SlidingWindowFeature(data, sliding_window)
Exemplo n.º 5
0
    def extract(self, wav):
        """Extract features

        Parameters
        ----------
        wav : string
            Path to wav file.

        Returns
        -------
        features : SlidingWindowFeature

        """
        # hack: obtain Yaafe data flow and the ordered feature-name stack
        flow, feature_names = self.get_flow_and_stack()

        # set up the Yaafe engine with this data flow
        yaafe_engine = yaafelib.Engine()
        yaafe_engine.load(flow)

        # read the waveform and check it matches the expected rate
        sample_rate, raw_audio = scipy.io.wavfile.read(wav)
        assert sample_rate == self.sample_rate, "sample rate mismatch"

        # Yaafe expects float64, C-order, shape (1, n_samples)
        samples = np.array(
            raw_audio, dtype=np.float64, order='C').reshape(1, -1)

        extracted = yaafe_engine.processAudio(samples)

        # concatenate the requested features column-wise, in stack order
        data = np.hstack([extracted[name] for name in feature_names])

        sliding_window = YaafeFrame(
            blockSize=self.block_size,
            stepSize=self.step_size,
            sampleRate=self.sample_rate)

        return SlidingWindowFeature(data, sliding_window)
Exemplo n.º 6
0
    def __call__(self, wav):
        """Extract features

        Parameters
        ----------
        wav : string
            Path to wav file.

        Returns
        -------
        features : SlidingWindowFeature

        """
        definition = self.definition()

        # --- declare every feature of the definition in a feature plan
        feature_plan = yaafelib.FeaturePlan(sample_rate=self.sample_rate)
        for name, recipe in definition:
            declaration = "{name}: {recipe}".format(name=name, recipe=recipe)
            assert feature_plan.addFeature(declaration)

        # --- build the Yaafe engine from the plan
        engine = yaafelib.Engine()
        engine.load(feature_plan.getDataFlow())

        # --- load audio and check its sample rate
        sample_rate, raw_audio = scipy.io.wavfile.read(wav)
        assert sample_rate == self.sample_rate, "sample rate mismatch"

        # Yaafe expects float64, C-contiguous, shape (1, n_samples)
        audio = np.array(raw_audio, dtype=np.float64, order='C').reshape(1, -1)

        # --- run extraction and stack the features column-wise,
        # following the order of the definition
        computed = engine.processAudio(audio)
        data = np.hstack([computed[name] for name, _ in definition])

        sliding_window = YaafeFrame(blockSize=self.block_size,
                                    stepSize=self.step_size,
                                    sampleRate=self.sample_rate)

        return SlidingWindowFeature(data, sliding_window)
Exemplo n.º 7
0
    def __call__(self, item):
        """Extract features

        Parameters
        ----------
        item : dict
            Must provide a 'wav' key (path to wav file); optional
            'channel' key (1-based channel index, defaults to 1).

        Returns
        -------
        features : SlidingWindowFeature

        Raises
        ------
        PyannoteFeatureExtractionError
            If the wav file cannot be read or decodes to NaNs.
        """

        try:
            wav = item['wav']
            y, sample_rate, encoding = pysndfile.sndio.read(wav)
        except IOError as e:
            # fix: Python 3 exceptions have no `.message` attribute
            # (it was deprecated by PEP 352 and later removed);
            # str(e) works on both Python 2 and 3.
            raise PyannoteFeatureExtractionError(str(e))

        if np.any(np.isnan(y)):
            uri = get_unique_identifier(item)
            msg = 'pysndfile output contains NaNs for file "{uri}".'
            raise PyannoteFeatureExtractionError(msg.format(uri=uri))

        # reshape before selecting channel
        if len(y.shape) < 2:
            y = y.reshape(-1, 1)

        # 1-based channel index -> 0-based column
        channel = item.get('channel', 1)
        y = y[:, channel - 1]

        data = self.process(y, sample_rate)

        # NaNs in the output are only worth a warning, not a failure
        if np.any(np.isnan(data)):
            uri = get_unique_identifier(item)
            msg = 'Features extracted from "{uri}" contain NaNs.'
            warnings.warn(msg.format(uri=uri))

        return SlidingWindowFeature(data.T, self.sliding_window_)
Exemplo n.º 8
0
    def __call__(self, item):
        """Extract features

        Loads audio, runs Yaafe feature extraction, then stacks
        `self.stack` consecutive frames into each output frame.

        Parameters
        ----------
        item : dict
            Audio file description, as accepted by `read_audio`.

        Returns
        -------
        features : SlidingWindowFeature

        """

        # --- load audio file
        y, sample_rate = read_audio(item,
                                    sample_rate=self.sample_rate,
                                    mono=True)

        # --- update data_flow every time sample rate changes
        # (re)build the Yaafe feature plan lazily, only on the first call
        # or when the actual sample rate differs from the previous one
        if not hasattr(self,
                       'sample_rate_') or self.sample_rate_ != sample_rate:
            self.sample_rate_ = sample_rate
            feature_plan = yaafelib.FeaturePlan(sample_rate=self.sample_rate_)
            for name, recipe in self.definition():
                assert feature_plan.addFeature("{name}: {recipe}".format(
                    name=name, recipe=recipe))
            data_flow = feature_plan.getDataFlow()
            self.engine_.load(data_flow)

        # Yaafe needs this: float64, column-contiguous, 2-dimensional
        y = np.array(y, dtype=np.float64, order='C').reshape((1, -1))

        # --- extract features
        features = self.engine_.processAudio(y)
        # concatenate per-feature arrays column-wise, in definition order
        data = np.hstack([features[name] for name, _ in self.definition()])

        # --- stack features
        # pad with copies of the first/last frame (np.zeros(...) + data[0]
        # broadcasts data[0] over every padding row) so that each original
        # frame has `self.stack` consecutive frames available; for an even
        # stack size, one fewer frame is appended at the end
        n_samples, n_features = data.shape
        zero_padding = self.stack // 2
        if self.stack % 2 == 0:
            expanded_data = np.concatenate(
                (np.zeros((zero_padding, n_features)) + data[0], data,
                 np.zeros((zero_padding - 1, n_features)) + data[-1]))
        else:
            expanded_data = np.concatenate(
                (np.zeros((zero_padding, n_features)) + data[0], data,
                 np.zeros((zero_padding, n_features)) + data[-1]))

        # overlapping view: row i exposes frames i..i+stack-1 of
        # expanded_data as one flat vector. NOTE(review): this reuses the
        # strides of the pre-padding `data` array — valid only if both
        # arrays are C-contiguous float64 with the same row stride; confirm.
        data = np.lib.stride_tricks.as_strided(expanded_data,
                                               shape=(n_samples,
                                                      n_features * self.stack),
                                               strides=data.strides)

        self.engine_.reset()

        # --- return as SlidingWindowFeature
        # NaNs are only worth a warning, not a failure
        if np.any(np.isnan(data)):
            uri = get_unique_identifier(item)
            msg = 'Features extracted from "{uri}" contain NaNs.'
            warnings.warn(msg.format(uri=uri))

        return SlidingWindowFeature(data, self.sliding_window_)
Exemplo n.º 9
0
    def post_process(self):
        """Run speaker diarisation and store the result.

        Pipeline: Yaafe MFCC -> speech activity detection -> gaussian
        divergence segmentation -> BIC clustering -> merge of adjacent
        segments sharing the same speaker label.
        """
        # extract mfcc with yaafe and store them to be used with pyannote
        res_yaafe = self.parents['yaafe'].results['yaafe.mfccchop']
        mfcc = res_yaafe.data_object.value

        sw = YaafeFrame(self.input_blocksize, self.input_stepsize,
                        self.input_samplerate)
        pyannotefeat = SlidingWindowFeature(mfcc, sw)

        # gaussian divergence window size, converted from seconds to frames
        timestepsize = self.input_stepsize / float(self.input_samplerate)
        gdiff_win_size_frame = int(self.gdiff_win_size_sec / timestepsize)
        min_seg_size_frame = int(self.min_seg_size_sec / timestepsize)

        # speech activity detection
        sad_analyzer = self.parents['sad_analyzer']
        res_sad = sad_analyzer.results['limsi_sad.sad_lhh_diff']
        sadval = res_sad.data_object.value[:]
        # indices of frames detected as speech
        speech_threshold = 0.
        frameids = [
            i for i, val in enumerate(sadval) if val > speech_threshold
        ]

        # compute gaussian divergence of speech frames only
        gdiff = gauss_div(mfcc[frameids, :], gdiff_win_size_frame)

        # initial segmentation based on gaussian divergence criterion
        seg = segment(gdiff, min_seg_size_frame)

        # Convert initial segmentation to pyannote annotation.
        # `seg` is aligned with `frameids`; presumably a value of 1 marks
        # the first frame of a new segment — confirm against `segment`.
        chunks = Annotation()
        fbegin = None

        lastframe = None
        ichunk = 0
        for segval, iframe in zip(seg, frameids):
            if segval == 1:
                # segment boundary: close the previous chunk (if any)
                if lastframe is not None:
                    chunks[pyannotefeat.sliding_window.rangeToSegment(
                        fbegin, iframe - fbegin)] = str(ichunk)
                    ichunk += 1
                fbegin = iframe
            elif iframe - 1 != lastframe:
                # gap in speech frames: close the chunk at the gap
                if lastframe is not None:
                    chunks[pyannotefeat.sliding_window.rangeToSegment(
                        fbegin, lastframe - fbegin + 1)] = str(ichunk)
                fbegin = iframe
            lastframe = iframe
        # flush the trailing chunk, if it is not empty
        if lastframe != fbegin:
            chunks[pyannotefeat.sliding_window.rangeToSegment(
                fbegin, lastframe - fbegin + 1)] = str(ichunk)

        # performs BIC clustering
        bicClustering = BICClustering(covariance_type='full',
                                      penalty_coef=self.bic_penalty_coeff)
        hypothesis = bicClustering(chunks, feature=pyannotefeat)

        # get diarisation results
        # NOTE(review): assumes the three itertracks() calls iterate in
        # the same order — confirm with pyannote's Annotation API
        tmplabel = [int(h[2]) for h in hypothesis.itertracks(True)]
        tmptime = [h[0].start for h in hypothesis.itertracks()]
        tmpduration = [h[0].duration for h in hypothesis.itertracks()]

        # merge adjacent clusters having same labels
        label = []
        time = []
        duration = []
        lastlabel = None
        for l, t, d in zip(tmplabel, tmptime, tmpduration):
            if l != lastlabel:
                label.append(l)
                duration.append(d)
                time.append(t)
            else:
                # same speaker: extend previous segment to this one's end
                duration[-1] = t + d - time[-1]
            lastlabel = l

        # store diarisation result
        diar_res = self.new_result(data_mode='label', time_mode='segment')
        diar_res.id_metadata.id += '.' + 'speakers'  # + name + 'diarisation'
        diar_res.id_metadata.name += ' ' + 'speaker identifiers'  # name + 'diarisation'
        diar_res.data_object.label = label
        diar_res.data_object.time = time
        diar_res.data_object.duration = duration
        # map each numeric label to its string representation
        diar_res.data_object.label_metadata.label = dict()
        for lab in diar_res.data_object.label:
            diar_res.data_object.label_metadata.label[lab] = str(lab)

        self.add_result(diar_res)