def scores(self, segmentation, features):
    """Compute per-target scores for every track of `segmentation`.

    Parameters
    ----------
    segmentation : annotation with `itertracks`
        Tracks to be scored.
    features : SlidingWindowFeature-like
        Frame-level features (`.data`, `.sliding_window`).

    Returns
    -------
    scores : Scores
        One score per (segment, track, target).
    """
    # container that will receive all (segment, track, target) scores
    scores = Scores(uri=segmentation.uri, modality=segmentation.modality)

    # raw features data
    data = features.data

    # one row of frame-level scores per target, then transpose to
    # (n_frames, n_targets)
    per_target = [self._apply_model(self._model[target], data)
                  for target in self.targets]
    targets_scores = np.vstack(per_target).T

    # normalize against the background model when one was trained
    if hasattr(self, '_background'):
        targets_scores = self._apply_background(data, targets_scores)

    # TODO: make it work for any kind of features
    new_features = SlidingWindowFeature(targets_scores,
                                        features.sliding_window)

    # aggregate frame-level scores over each track's temporal extent
    for segment, track in segmentation.itertracks():
        aggregated = self._aggregate_track_scores(new_features.crop(segment))
        for t, target in enumerate(self.targets):
            scores[segment, track, target] = aggregated[t]

    return scores
def scores(self, segmentation, features):
    """Score every track of `segmentation` against each known target.

    Parameters
    ----------
    segmentation : annotation with `itertracks`
    features : SlidingWindowFeature-like

    Returns
    -------
    scores : Scores
    """
    # empty structure to hold all scores
    scores = Scores(uri=segmentation.uri, modality=segmentation.modality)

    # raw feature matrix
    data = features.data

    # collect frame-level scores target by target, stack column-wise
    columns = []
    for target in self.targets:
        columns.append(self._apply_model(self._model[target], data))
    targets_scores = np.vstack(columns).T

    # background normalization, only when a background model exists
    if hasattr(self, '_background'):
        targets_scores = self._apply_background(data, targets_scores)

    # TODO: make it work for any kind of features
    new_features = SlidingWindowFeature(targets_scores,
                                        features.sliding_window)

    for segment, track in segmentation.itertracks():
        # summarize per-frame scores over this track's extent
        x = self._aggregate_track_scores(new_features.crop(segment))
        for index, target in enumerate(self.targets):
            scores[segment, track, target] = x[index]

    return scores
def __call__(self, item):
    """Extract features

    Parameters
    ----------
    item : dict

    Returns
    -------
    features : SlidingWindowFeature
    """
    # load audio as a mono signal at the configured sample rate
    y, sample_rate = read_audio(item,
                                sample_rate=self.sample_rate,
                                mono=True)

    data = self.process(y, sample_rate)

    # warn (but do not fail) when extraction produced NaNs
    if np.any(np.isnan(data)):
        uri = get_unique_identifier(item)
        msg = 'Features extracted from "{uri}" contain NaNs.'
        warnings.warn(msg.format(uri=uri))

    # transpose so time is the first axis
    return SlidingWindowFeature(data.T, self.sliding_window_)
def extract(self, path):
    """Run frame-wise processing on a video file.

    Parameters
    ----------
    path : string
        Path to video file.

    Returns
    -------
    features : SlidingWindowFeature
        One feature vector per video frame.
    """
    capture = cv2.VideoCapture(path)

    # frame size
    # height = int(capture.get(cv.CV_CAP_PROP_FRAME_HEIGHT))
    # width = int(capture.get(cv.CV_CAP_PROP_FRAME_WIDTH))

    # video "size"
    fps = capture.get(cv.CV_CAP_PROP_FPS)
    n_frames = int(capture.get(cv.CV_CAP_PROP_FRAME_COUNT))
    # duration = frameCount / framePerSecond

    # pre-fill with NaNs so frames that could not be read stay undefined
    data = np.NaN * np.ones((n_frames, self.get_dimension()))

    while True:
        # index of the frame about to be decoded
        index = int(capture.get(cv.CV_CAP_PROP_POS_FRAMES))
        success, frame = capture.read()
        if not success:
            break
        data[index, :] = self.process_frame(frame)

    # one feature vector per frame, back to back
    duration = step = 1. / fps
    sliding_window = SlidingWindow(start=0., duration=duration, step=step)
    return SlidingWindowFeature(data, sliding_window)
def extract(self, wav):
    """Extract features

    Parameters
    ----------
    wav : string
        Path to wav file.

    Returns
    -------
    features : SlidingWindowFeature
    """
    # hack
    data_flow, stack = self.get_flow_and_stack()

    # set up a fresh Yaafe engine with the pre-built data flow
    engine = yaafelib.Engine()
    engine.load(data_flow)

    sample_rate, raw_audio = scipy.io.wavfile.read(wav)
    assert sample_rate == self.sample_rate, "sample rate mismatch"

    # Yaafe expects float64, C-contiguous, shape (1, n_samples)
    audio = np.array(raw_audio, dtype=np.float64, order='C').reshape(1, -1)
    features = engine.processAudio(audio)

    # concatenate the requested feature groups along the feature axis
    parts = [features[name] for name in stack]
    data = np.hstack(parts)

    sliding_window = YaafeFrame(blockSize=self.block_size,
                                stepSize=self.step_size,
                                sampleRate=self.sample_rate)
    return SlidingWindowFeature(data, sliding_window)
def __call__(self, wav):
    """Extract features

    Parameters
    ----------
    wav : string
        Path to wav file.

    Returns
    -------
    features : SlidingWindowFeature
    """
    definition = self.definition()

    # --- prepare the feature plan
    feature_plan = yaafelib.FeaturePlan(sample_rate=self.sample_rate)
    for name, recipe in definition:
        assert feature_plan.addFeature(
            "{name}: {recipe}".format(name=name, recipe=recipe))

    # --- prepare the Yaafe engine
    engine = yaafelib.Engine()
    engine.load(feature_plan.getDataFlow())

    sample_rate, raw_audio = scipy.io.wavfile.read(wav)
    assert sample_rate == self.sample_rate, "sample rate mismatch"

    # Yaafe expects float64, C-contiguous, shape (1, n_samples)
    audio = np.array(raw_audio, dtype=np.float64, order='C').reshape(1, -1)
    features = engine.processAudio(audio)

    # keep features in the order they were declared in the plan
    parts = [features[name] for name, _ in definition]
    data = np.hstack(parts)

    sliding_window = YaafeFrame(blockSize=self.block_size,
                                stepSize=self.step_size,
                                sampleRate=self.sample_rate)
    return SlidingWindowFeature(data, sliding_window)
def __call__(self, item):
    """Extract features

    Parameters
    ----------
    item : dict
        Must contain a 'wav' key (path to wav file); may contain a
        'channel' key (1-based channel index, defaults to 1).

    Returns
    -------
    features : SlidingWindowFeature

    Raises
    ------
    PyannoteFeatureExtractionError
        If the wav file cannot be read, or if pysndfile output
        contains NaNs.
    """
    try:
        wav = item['wav']
        y, sample_rate, encoding = pysndfile.sndio.read(wav)
    except IOError as e:
        # FIX: exceptions have no `.message` attribute in Python 3
        # (the old `e.message` would itself raise AttributeError here);
        # str(e) is correct on both Python 2 and 3.
        raise PyannoteFeatureExtractionError(str(e))

    if np.any(np.isnan(y)):
        uri = get_unique_identifier(item)
        msg = 'pysndfile output contains NaNs for file "{uri}".'
        raise PyannoteFeatureExtractionError(msg.format(uri=uri))

    # reshape before selecting channel
    if len(y.shape) < 2:
        y = y.reshape(-1, 1)
    # 'channel' is 1-based; default to first channel
    channel = item.get('channel', 1)
    y = y[:, channel - 1]

    data = self.process(y, sample_rate)

    # NaNs in extracted features only trigger a warning, not an error
    if np.any(np.isnan(data)):
        uri = get_unique_identifier(item)
        msg = 'Features extracted from "{uri}" contain NaNs.'
        warnings.warn(msg.format(uri=uri))

    # transpose so time is the first axis
    return SlidingWindowFeature(data.T, self.sliding_window_)
def __call__(self, item): """Extract features Parameters ---------- item : dict Returns ------- features : SlidingWindowFeature """ # --- load audio file y, sample_rate = read_audio(item, sample_rate=self.sample_rate, mono=True) # --- update data_flow every time sample rate changes if not hasattr(self, 'sample_rate_') or self.sample_rate_ != sample_rate: self.sample_rate_ = sample_rate feature_plan = yaafelib.FeaturePlan(sample_rate=self.sample_rate_) for name, recipe in self.definition(): assert feature_plan.addFeature("{name}: {recipe}".format( name=name, recipe=recipe)) data_flow = feature_plan.getDataFlow() self.engine_.load(data_flow) # Yaafe needs this: float64, column-contiguous, 2-dimensional y = np.array(y, dtype=np.float64, order='C').reshape((1, -1)) # --- extract features features = self.engine_.processAudio(y) data = np.hstack([features[name] for name, _ in self.definition()]) # --- stack features n_samples, n_features = data.shape zero_padding = self.stack // 2 if self.stack % 2 == 0: expanded_data = np.concatenate( (np.zeros((zero_padding, n_features)) + data[0], data, np.zeros((zero_padding - 1, n_features)) + data[-1])) else: expanded_data = np.concatenate( (np.zeros((zero_padding, n_features)) + data[0], data, np.zeros((zero_padding, n_features)) + data[-1])) data = np.lib.stride_tricks.as_strided(expanded_data, shape=(n_samples, n_features * self.stack), strides=data.strides) self.engine_.reset() # --- return as SlidingWindowFeature if np.any(np.isnan(data)): uri = get_unique_identifier(item) msg = 'Features extracted from "{uri}" contain NaNs.' warnings.warn(msg.format(uri=uri)) return SlidingWindowFeature(data, self.sliding_window_)
def post_process(self):
    """Run speaker diarization and store the result.

    Pipeline: MFCC (from the 'yaafe' parent) -> speech activity masking
    (from the 'sad_analyzer' parent) -> Gaussian-divergence initial
    segmentation -> BIC clustering -> merged, labelled segments stored
    via self.add_result().
    """
    # extract mfcc with yaafe and store them to be used with pyannote
    res_yaafe = self.parents['yaafe'].results['yaafe.mfccchop']
    mfcc = res_yaafe.data_object.value
    sw = YaafeFrame(self.input_blocksize, self.input_stepsize,
                    self.input_samplerate)
    pyannotefeat = SlidingWindowFeature(mfcc, sw)

    # gaussian divergence window size, converted from seconds to frames
    timestepsize = self.input_stepsize / float(self.input_samplerate)
    gdiff_win_size_frame = int(self.gdiff_win_size_sec / timestepsize)
    min_seg_size_frame = int(self.min_seg_size_sec / timestepsize)

    # speech activity detection
    sad_analyzer = self.parents['sad_analyzer']
    res_sad = sad_analyzer.results['limsi_sad.sad_lhh_diff']
    sadval = res_sad.data_object.value[:]

    # indices of frames detected as speech
    speech_threshold = 0.
    frameids = [i for i, val in enumerate(sadval)
                if val > speech_threshold]

    # compute gaussian divergence of speech frames only
    gdiff = gauss_div(mfcc[frameids, :], gdiff_win_size_frame)

    # initial segmentation based on gaussian divergence criterion
    # (assumes `segment` marks boundary frames with 1 — TODO confirm)
    seg = segment(gdiff, min_seg_size_frame)

    # Convert initial segmentation to pyannote annotation.
    # A chunk is closed either at a detected boundary (segval == 1) or
    # when there is a gap in the speech frame indices (non-speech).
    chunks = Annotation()
    fbegin = None      # first frame of the chunk being built
    lastframe = None   # previous speech frame seen
    ichunk = 0         # running chunk label
    for segval, iframe in zip(seg, frameids):
        if segval == 1:
            # boundary: close the current chunk (if any) and start a new one
            if lastframe is not None:
                chunks[pyannotefeat.sliding_window.rangeToSegment(
                    fbegin, iframe - fbegin)] = str(ichunk)
                ichunk += 1
            fbegin = iframe
        elif iframe - 1 != lastframe:
            # gap in speech frames: close the current chunk at lastframe
            # (NOTE(review): ichunk is not incremented on this path, so the
            # next chunk reuses the same label — verify this is intended)
            if lastframe is not None:
                chunks[pyannotefeat.sliding_window.rangeToSegment(
                    fbegin, lastframe - fbegin + 1)] = str(ichunk)
            fbegin = iframe
        lastframe = iframe
    # flush the last open chunk
    if lastframe != fbegin:
        chunks[pyannotefeat.sliding_window.rangeToSegment(
            fbegin, lastframe - fbegin + 1)] = str(ichunk)

    # performs BIC clustering
    bicClustering = BICClustering(covariance_type='full',
                                  penalty_coef=self.bic_penalty_coeff)
    hypothesis = bicClustering(chunks, feature=pyannotefeat)

    # get diarisation results (label, start time, duration per track)
    tmplabel = [int(h[2]) for h in hypothesis.itertracks(True)]
    tmptime = [h[0].start for h in hypothesis.itertracks()]
    tmpduration = [h[0].duration for h in hypothesis.itertracks()]

    # merge adjacent clusters having same labels
    label = []
    time = []
    duration = []
    lastlabel = None
    for l, t, d in zip(tmplabel, tmptime, tmpduration):
        if l != lastlabel:
            label.append(l)
            duration.append(d)
            time.append(t)
        else:
            # extend the previous segment up to the end of this one
            duration[-1] = t + d - time[-1]
        lastlabel = l

    # store diarisation result
    diar_res = self.new_result(data_mode='label', time_mode='segment')
    diar_res.id_metadata.id += '.' + 'speakers'  # + name + 'diarisation'
    diar_res.id_metadata.name += ' ' + 'speaker identifiers'  # name + 'diarisation'
    diar_res.data_object.label = label
    diar_res.data_object.time = time
    diar_res.data_object.duration = duration
    # map each numeric label to its string form for display
    diar_res.data_object.label_metadata.label = dict()
    for lab in diar_res.data_object.label:
        diar_res.data_object.label_metadata.label[lab] = str(lab)
    self.add_result(diar_res)