def __init__(
    self,
    scores: Optional[Wrappable] = None,
    purity: Optional[float] = 0.95,
    fscore: bool = False,
    diarization: bool = False,
):
    """Initialize the pipeline.

    Parameters
    ----------
    scores : Wrappable, optional
        Source of (presumably speaker-change-detection) scores.
        Defaults to the precomputed "@scd_scores" key.
        Fix: annotated Optional[...] so the None default is well-typed.
    purity : float, optional
        Target purity used during optimization. Defaults to 0.95.
    fscore : bool, optional
        Optimize f-score instead (presumably — confirm against the
        objective function of the enclosing class). Defaults to False.
    diarization : bool, optional
        Defaults to False.
    """
    super().__init__()

    # Fall back to precomputed scores stored under the "@scd_scores" key.
    if scores is None:
        scores = "@scd_scores"
    self.scores = scores
    self._scores = Wrapper(self.scores)

    self.purity = purity
    self.fscore = fscore
    self.diarization = diarization

    # Hyper-parameters (tunable ranges — Uniform comes from pyannote.pipeline).
    self.alpha = Uniform(0.0, 1.0)
    self.min_duration = Uniform(0.0, 10.0)
def __init__(
    self,
    sad_scores: Optional[Union[Text, Path]] = None,
    scd_scores: Optional[Union[Text, Path]] = None,
    embedding: Optional[Union[Text, Path]] = None,
    metric: Optional[str] = "cosine",
    method: Optional[str] = "pool",
    evaluation_only: Optional[bool] = False,
    purity: Optional[float] = None,
):
    """Initialize the speaker diarization pipeline.

    Parameters
    ----------
    sad_scores : Text or Path, optional
        Source of speech activity detection scores, or "oracle".
        Fix: annotated Optional[...] so the None default is well-typed
        (same for `scd_scores` and `embedding`).
    scd_scores : Text or Path, optional
        Source of speaker change detection scores, or "oracle".
    embedding : Text or Path, optional
        Source of speaker embeddings.
    metric : str, optional
        Distance metric used for clustering/assignment. Defaults to "cosine".
    method : str, optional
        Clustering method. Defaults to "pool".
    evaluation_only : bool, optional
        Defaults to False.
    purity : float, optional
        Target purity. Defaults to None.

    Raises
    ------
    ValueError
        When only one of `sad_scores` / `scd_scores` is set to "oracle":
        oracle speech turn segmentation requires both.
    """
    super().__init__()

    self.sad_scores = sad_scores
    self.scd_scores = scd_scores

    # Oracle segmentation is all-or-nothing: both score sources must be oracle.
    if self.scd_scores == "oracle":
        if self.sad_scores == "oracle":
            self.speech_turn_segmentation = OracleSpeechTurnSegmentation()
        else:
            msg = (
                f"Both sad_scores and scd_scores should be set to 'oracle' "
                f"for oracle speech turn segmentation, "
                f"got {self.sad_scores} and {self.scd_scores}, respectively."
            )
            raise ValueError(msg)
    else:
        self.speech_turn_segmentation = SpeechTurnSegmentation(
            sad_scores=self.sad_scores, scd_scores=self.scd_scores)

    self.evaluation_only = evaluation_only
    self.purity = purity

    # Hyper-parameter (tunable range — Uniform comes from pyannote.pipeline).
    self.min_duration = Uniform(0, 10)

    self.embedding = embedding
    self.metric = metric
    self.method = method

    self.speech_turn_clustering = SpeechTurnClustering(
        embedding=self.embedding, metric=self.metric, method=self.method)
    self.speech_turn_assignment = SpeechTurnClosestAssignment(
        embedding=self.embedding, metric=self.metric)
def __init__(self, scores: Optional[Wrappable] = None, fscore: bool = False):
    """Initialize the pipeline.

    Parameters
    ----------
    scores : Wrappable, optional
        Source of (presumably speech-activity-detection) scores.
        Defaults to the precomputed "@sad_scores" key.
        Fix: annotated Optional[...] so the None default is well-typed.
    fscore : bool, optional
        Optimize f-score instead (presumably — confirm against the
        objective function of the enclosing class). Defaults to False.
    """
    super().__init__()

    # Fall back to precomputed scores stored under the "@sad_scores" key.
    if scores is None:
        scores = "@sad_scores"
    self.scores = scores
    self._scores = Wrapper(self.scores)

    self.fscore = fscore

    # Hyper-parameters (tunable ranges — Uniform comes from pyannote.pipeline):
    # onset/offset thresholds, minimum on/off durations, and onset/offset padding.
    self.onset = Uniform(0.0, 1.0)
    self.offset = Uniform(0.0, 1.0)
    self.min_duration_on = Uniform(0.0, 2.0)
    self.min_duration_off = Uniform(0.0, 2.0)
    self.pad_onset = Uniform(-1.0, 1.0)
    self.pad_offset = Uniform(-1.0, 1.0)
def __init__(
    self,
    sad: Optional[Union[Text, Path, dict]] = None,
    emb: Union[Text, Path] = "emb",
    batch_size: Optional[int] = None,
    only_sad: bool = False,
):
    """Initialize the pipeline.

    Parameters
    ----------
    sad : Text, Path or dict, optional
        Speech activity detection model (anything `Wrapper` accepts).
        Defaults to {"sad": {"duration": 2.0, "step": 0.1}}.
        Fix: the original used a dict literal as default argument (a shared
        mutable default) annotated Union[Text, Path], which did not even
        cover that dict; replaced with a None sentinel and a wider annotation.
    emb : Text or Path, optional
        Speaker embedding model. Defaults to "emb".
    batch_size : int, optional
        Batch size forwarded to both wrapped models when set.
        Fix: annotated Optional[int] so the None default is well-typed.
    only_sad : bool, optional
        When True, skip speaker embedding setup entirely. Defaults to False.
    """
    super().__init__()

    # Build the default SAD configuration at call time (never share a
    # mutable default across instances).
    if sad is None:
        sad = {"sad": {"duration": 2.0, "step": 0.1}}

    self.sad = Wrapper(sad)
    if batch_size is not None:
        self.sad.batch_size = batch_size
    self.sad_speech_index_ = self.sad.classes.index("speech")

    # SAD hyper-parameters (tunable ranges — Uniform comes from pyannote.pipeline).
    self.sad_threshold_on = Uniform(0.0, 1.0)
    self.sad_threshold_off = Uniform(0.0, 1.0)
    self.sad_min_duration_on = Uniform(0.0, 0.5)
    self.sad_min_duration_off = Uniform(0.0, 0.5)

    self.only_sad = only_sad
    if self.only_sad:
        # SAD-only mode: no embedding model, no embedding hyper-parameters.
        return

    self.emb = Wrapper(emb)
    if batch_size is not None:
        self.emb.batch_size = batch_size

    # Embedding window bounds: at most the model's training duration,
    # at least its declared minimum (or a quarter of the maximum).
    max_duration = self.emb.duration
    min_duration = getattr(self.emb, "min_duration", 0.25 * max_duration)
    self.emb_duration = Uniform(min_duration, max_duration)
    self.emb_step_ratio = Uniform(0.1, 1.0)
    self.emb_threshold = Uniform(0.0, 2.0)
def __init__(
    self,
    scores: Optional[Wrappable] = None,
    precision: float = 0.9,
    fscore: bool = False,
):
    """Initialize the pipeline.

    Parameters
    ----------
    scores : Wrappable, optional
        Source of (presumably overlapped-speech-detection) scores.
        Defaults to the precomputed "@ovl_scores" key.
        Fix: annotated Optional[...] so the None default is well-typed.
    precision : float, optional
        Target precision used during optimization. Defaults to 0.9.
    fscore : bool, optional
        Optimize f-score instead (presumably — confirm against the
        objective function of the enclosing class). Defaults to False.
    """
    super().__init__()

    # Fall back to precomputed scores stored under the "@ovl_scores" key.
    if scores is None:
        scores = "@ovl_scores"
    self.scores = scores
    self._scores = Wrapper(self.scores)

    self.precision = precision
    self.fscore = fscore

    # Hyper-parameters (tunable ranges — Uniform comes from pyannote.pipeline):
    # onset/offset thresholds, minimum on/off durations, and onset/offset padding.
    self.onset = Uniform(0.0, 1.0)
    self.offset = Uniform(0.0, 1.0)
    self.min_duration_on = Uniform(0.0, 2.0)
    self.min_duration_off = Uniform(0.0, 2.0)
    self.pad_onset = Uniform(-1.0, 1.0)
    self.pad_offset = Uniform(-1.0, 1.0)
def __init__(self,
             feature_extraction: Optional[dict] = None,
             architecture: Optional[dict] = None,
             overlap: Optional[bool] = False,
             keep_sad: Optional[bool] = False,
             mask: Optional[dict] = None,
             augmentation: Optional[bool] = False,
             duration: Optional[float] = 2.0,
             batch_size: Optional[int] = 32,
             gpu: Optional[bool] = False):
    """Initialize the pipeline.

    Parameters
    ----------
    feature_extraction : dict, optional
        {"name": ..., "params": {...}} configuration. Defaults to
        LibrosaMFCC with 19 coefficients + deltas, 25 ms window, 10 ms
        step, 16 kHz sample rate.
    architecture : dict, optional
        {"name": ..., "params": {...}} configuration. Defaults to
        PyanNet with {'sincnet': {'skip': True}}.
    overlap : bool, optional
        When True, an `overlap_threshold` hyper-parameter is added.
        Defaults to False.
    keep_sad : bool, optional
        Defaults to False.
    mask : dict, optional
        {"dimension": ..., "log_scale": ...} configuration; both keys
        are required when provided. Defaults to None (no mask).
    augmentation : bool, optional
        Defaults to False.
    duration : float, optional
        Defaults to 2.0 (presumably seconds — confirm against the
        training loop). 
    batch_size : int, optional
        Defaults to 32.
        Fix: was annotated Optional[float]; a batch size is an integer.
    gpu : bool, optional
        When True, use the "cuda" torch device; otherwise "cpu".
        Defaults to False.
    """
    # Feature extraction: fall back to a fixed MFCC front-end when no
    # configuration dict is given.
    if feature_extraction is None:
        from pyannote.audio.features import LibrosaMFCC
        self.feature_extraction_ = LibrosaMFCC(
            e=False, De=True, DDe=True,
            coefs=19, D=True, DD=True,
            duration=0.025, step=0.010, sample_rate=16000,
        )
    else:
        FeatureExtraction = get_class_by_name(
            feature_extraction['name'],
            default_module_name='pyannote.audio.features')
        # Augmentation is deliberately disabled at the feature level here.
        self.feature_extraction_ = FeatureExtraction(
            **feature_extraction.get('params', {}),
            augmentation=None)

    # Network architecture: class + params are stored (not instantiated yet).
    if architecture is None:
        from pyannote.audio.models import PyanNet
        self.Architecture_ = PyanNet
        self.architecture_params_ = {'sincnet': {'skip': True}}
    else:
        self.Architecture_ = get_class_by_name(
            architecture['name'],
            default_module_name='pyannote.audio.models')
        self.architecture_params_ = architecture.get('params', {})

    self.overlap = overlap
    self.keep_sad = keep_sad

    self.mask = mask
    if mask is None:
        self.mask_dimension_ = None
        self.mask_logscale_ = False
    else:
        self.mask_dimension_ = mask['dimension']
        self.mask_logscale_ = mask['log_scale']

    self.augmentation = augmentation
    self.duration = duration
    self.batch_size = batch_size

    self.gpu = gpu
    self.device_ = torch.device('cuda') if self.gpu else torch.device('cpu')

    # Hyper-parameters (tunable ranges — LogUniform/Integer/Uniform come
    # from pyannote.pipeline).
    self.learning_rate = LogUniform(1e-3, 1)
    self.epochs = Integer(10, 50)
    self.ensemble = Integer(1, 5)

    if self.overlap:
        self.overlap_threshold = Uniform(0, 1)