def test_overlay_cut_duration_and_supervisions(
    offset, allow_padding, expected_duration, exception_expectation, cut1, cut2
):
    with exception_expectation:
        mixed_cut = cut1.mix(cut2, offset_other_by=offset, allow_padding=allow_padding)
        assert isinstance(mixed_cut, MixedCut)
        assert mixed_cut.duration == expected_duration
        assert mixed_cut.supervisions == [
            SupervisionSegment(id="sup-1", recording_id="irrelevant", start=0.5, duration=6.0),
            SupervisionSegment(id="sup-2", recording_id="irrelevant", start=7.0, duration=2.0),
            SupervisionSegment(id="sup-3", recording_id="irrelevant", start=3.0 + offset, duration=2.5),
        ]
def search_supervision_set():
    return SupervisionSet.from_segments([
        SupervisionSegment(id='s1', recording_id='r1', start=0, duration=5.0, channel=0),
        SupervisionSegment(id='s2', recording_id='r1', start=4.5, duration=2.0, channel=1),
        SupervisionSegment(id='s3', recording_id='r1', start=8.0, duration=3.0, channel=0),
        SupervisionSegment(id='s4', recording_id='r2', start=1, duration=5.0, channel=0),
    ])
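# A hedged usage sketch for the fixture above: it is shaped for exercising
# SupervisionSet.find(), which filters segments by recording id, channel, and
# time bounds. The assertions are illustrative, not part of the original fixture.
def example_find_usage(search_supervision_set):
    matches = list(search_supervision_set.find(recording_id='r1', channel=0))
    # Only 's1' and 's3' are on channel 0 of recording 'r1'.
    assert [s.id for s in matches] == ['s1', 's3']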
def overlapping_supervisions_cut():
    return MonoCut(
        id="cut-1",
        start=0.0,
        duration=0.5,
        channel=0,
        features=Features(
            recording_id="recording-1",
            channels=0,
            start=0,
            duration=0.5,
            type="fbank",
            num_frames=50,
            num_features=80,
            frame_shift=0.01,
            sampling_rate=16000,
            storage_type="lilcom",
            storage_path="test/fixtures/dummy_feats/storage/",
            storage_key="e66b6386-aee5-4a5a-8369-fdde1d2b97c7.llc",
        ),
        supervisions=[
            SupervisionSegment(id="s1", recording_id="recording-1", start=0.0, duration=0.2),
            SupervisionSegment(id="s2", recording_id="recording-1", start=0.1, duration=0.2),
            SupervisionSegment(id="s3", recording_id="recording-1", start=0.2, duration=0.2),
            SupervisionSegment(id="s4", recording_id="recording-1", start=0.3, duration=0.2),
        ],
    )
def dummy_supervision_set_lazy():
    with NamedTemporaryFile(suffix=".jsonl.gz") as f:
        sups = SupervisionSet.from_segments(
            [
                SupervisionSegment(
                    id="sup1",
                    recording_id="rec1",
                    start=3,
                    duration=4,
                    channel=0,
                    text="dummy text",
                ),
                SupervisionSegment(
                    id="sup2",
                    recording_id="rec1",
                    start=7,
                    duration=2,
                    channel=0,
                    text="dummy text",
                ),
            ]
        )
        sups.to_file(f.name)
        f.flush()
        yield SupervisionSet.from_jsonl_lazy(f.name)
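# A minimal consumption sketch (assumed usage, not part of the fixture): a lazily
# opened manifest streams segments from the JSONL file on each iteration rather
# than holding them all in memory, so typical use is plain iteration.
def example_lazy_iteration(dummy_supervision_set_lazy):
    ids = [seg.id for seg in dummy_supervision_set_lazy]
    assert ids == ["sup1", "sup2"]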
def test_supervision_set_iteration():
    supervision_set = SupervisionSet(
        segments={
            'X': SupervisionSegment(id='X', recording_id='X', channel=0, start=2.0, duration=2.5),
            'Y': SupervisionSegment(id='Y', recording_id='X', channel=0, start=5.0, duration=5.0),
        }
    )
    assert 2 == len(supervision_set)
    assert 2 == len(list(supervision_set))
def test_append_cut_duration_and_supervisions(cut1, cut2):
    appended_cut = cut1.append(cut2)
    assert isinstance(appended_cut, MixedCut)
    assert appended_cut.duration == 20.0
    assert appended_cut.supervisions == [
        SupervisionSegment(id='sup-1', recording_id='irrelevant', start=0.5, duration=6.0),
        SupervisionSegment(id='sup-2', recording_id='irrelevant', start=7.0, duration=2.0),
        SupervisionSegment(id='sup-3', recording_id='irrelevant', start=13.0, duration=2.5),
    ]
def test_supervision_custom_attributes():
    sup = SupervisionSegment(id="X", recording_id="X", start=0.0, duration=0.1)
    sup.eye_color = "green"
    sup.wer = 0.41
    assert sup.eye_color == "green"
    assert sup.custom["eye_color"] == "green"
    assert sup.wer == 0.41
    assert sup.custom["wer"] == 0.41
    with pytest.raises(AttributeError):
        sup.nonexistent_attr
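# A hedged follow-up sketch: since custom attributes are stored in the ``custom``
# dict, they should survive a dict round-trip. The ``to_dict``/``from_dict`` pair
# is assumed here based on the serialization style used elsewhere in this code.
def example_custom_attribute_roundtrip():
    sup = SupervisionSegment(id="X", recording_id="X", start=0.0, duration=0.1)
    sup.wer = 0.41
    restored = SupervisionSegment.from_dict(sup.to_dict())
    assert restored.wer == 0.41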
def test_known_issue_with_overlap():
    r = dummy_recording(0)
    rec = RecordingSet.from_recordings([r])

    # Make two segments. The first segment is 1s long. The second segment
    # is 0.5 seconds long and lies entirely within the first. Both have the
    # same recording_id as the single entry in rec.
    sup = SupervisionSet.from_segments(
        [
            SupervisionSegment(
                id="utt1",
                recording_id=r.id,
                start=0.0,
                duration=1.0,
                channel=0,
                text="Hello",
            ),
            SupervisionSegment(
                id="utt2",
                recording_id=r.id,
                start=0.2,
                duration=0.5,
                channel=0,
                text="World",
            ),
        ]
    )
    cuts = CutSet.from_manifests(recordings=rec, supervisions=sup)
    assert len(cuts) == 1
    cuts_trim = cuts.trim_to_supervisions(keep_overlapping=False)
    assert len(cuts_trim) == 2

    cut = cuts_trim[0]
    assert cut.start == 0
    assert cut.duration == 1
    assert len(cut.supervisions) == 1
    sup = cut.supervisions[0]
    assert sup.start == 0
    assert sup.duration == 1
    assert sup.text == "Hello"

    cut = cuts_trim[1]
    assert cut.start == 0.2
    assert cut.duration == 0.5
    assert len(cut.supervisions) == 1
    sup = cut.supervisions[0]
    assert sup.start == 0
    assert sup.duration == 0.5
    assert sup.text == "World"
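# For contrast, a hedged sketch of the keep_overlapping=True variant: the cut
# trimmed to "World" would then also retain the part of "Hello" that overlaps it.
# The ``cuts`` argument stands for the CutSet built in the test above.
def example_trim_keeping_overlap(cuts):
    cuts_trim = cuts.trim_to_supervisions(keep_overlapping=True)
    cut = cuts_trim[1]
    # Both "Hello" and "World" overlap the [0.2, 0.7] window of the recording.
    assert len(cut.supervisions) == 2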
def cut1(dummy_features):
    return Cut(
        id='cut-1',
        start=0.0,
        duration=10.0,
        features=dummy_features,
        supervisions=[
            SupervisionSegment(id='sup-1', recording_id='irrelevant', start=0.5, duration=6.0),
            SupervisionSegment(id='sup-2', recording_id='irrelevant', start=7.0, duration=2.0),
        ],
    )
def prepare_supervision_other(
    audio: RecordingSet,
    annotations: Dict[str, List[AmiSegmentAnnotation]],
) -> SupervisionSet:
    annotation_by_id = {key[0]: annot for key, annot in annotations.items()}
    segments = []
    for recording in audio:
        annotation = annotation_by_id.get(recording.id)
        # In these mic settings, all sources (1 for ihm-mix and sdm, 16 for mdm)
        # share the same supervision.
        source = recording.sources[0]
        if annotation is None:
            logging.warning(f'No annotation found for recording {recording.id}')
            continue
        if len(source.channels) > 1:
            logging.warning(
                f'More than one channel in recording {recording.id}. '
                f'Creating supervision for channel 0 only.'
            )
        for seg_idx, seg_info in enumerate(annotation):
            duration = seg_info.end_time - seg_info.begin_time
            if duration > 0:
                segments.append(
                    SupervisionSegment(
                        id=f'{recording.id}-{seg_idx}',
                        recording_id=recording.id,
                        start=seg_info.begin_time,
                        duration=duration,
                        channel=0,
                        language='English',
                        speaker=seg_info.speaker,
                        gender=seg_info.gender,
                        text=seg_info.text,
                    )
                )
    return SupervisionSet.from_segments(segments)
def parse_utterance(
    audio: Any, root_path: Path
) -> Optional[Tuple[Recording, List[SupervisionSegment]]]:
    sampling_rate = int(audio["sample_rate"])
    recording = Recording(
        id=audio["aid"],
        sources=[
            AudioSource(
                type="file",
                channels=list(range(int(audio["channels"]))),
                source=str(root_path / audio["path"]),
            )
        ],
        num_samples=compute_num_samples(
            duration=Seconds(audio["duration"]), sampling_rate=sampling_rate
        ),
        sampling_rate=sampling_rate,
        duration=Seconds(audio["duration"]),
    )
    segments = []
    for seg in audio["segments"]:
        segments.append(
            SupervisionSegment(
                id=seg["sid"],
                recording_id=audio["aid"],
                start=Seconds(seg["begin_time"]),
                duration=round(Seconds(seg["end_time"] - seg["begin_time"]), ndigits=8),
                channel=0,
                language="English",
                speaker=seg["speaker"],
                text=seg["text_tn"],
            )
        )
    return recording, segments
def create_supervision(
    sessions_and_transcript_path: Tuple[Dict[str, Dict[str, str]], Pathlike]
) -> List[SupervisionSegment]:
    sessions, transcript_path = sessions_and_transcript_path
    transcript_path = Path(transcript_path)
    with codecs.open(transcript_path, "r", "utf8") as trans_f:
        lines = [l.rstrip("\n") for l in trans_f.readlines()][3:]
        lines = [l.split("\t") for l in lines if l.strip() != ""]
        lines = [
            [
                float(l[2]),
                float(l[3]),
                int(l[1]),
                " ".join([w for w in l[7].split() if w.strip() != ""]),
            ]
            for l in lines
        ]
    segments = [
        SupervisionSegment(
            id=transcript_path.stem + "-" + str(k).zfill(len(str(len(lines)))),
            recording_id=transcript_path.stem,
            start=round(l[0], 10),
            duration=round(l[1] - l[0], 10),
            channel=l[2],
            text=l[3],
            language="Spanish",
            speaker=sessions[transcript_path.stem.split("_")[2]][l[2]],
        )
        for k, l in enumerate(lines)
    ]
    return segments
def parse_utterance(
    audio: Any, root_path: Path
) -> Optional[Tuple[Recording, List[SupervisionSegment]]]:
    # Opus-format audio is force-decoded at 48kHz; the original sampling rate is ignored.
    opus_decoding_sample_rate = 48000
    recording = Recording(
        id=audio['aid'],
        sources=[
            AudioSource(
                type='file',
                channels=list(range(int(audio['channels']))),
                source=f'{root_path}/{audio["path"]}',
            )
        ],
        num_samples=compute_num_samples(
            duration=Seconds(audio['duration']),
            sampling_rate=opus_decoding_sample_rate,
        ),
        sampling_rate=opus_decoding_sample_rate,
        duration=Seconds(audio['duration']),
    ).resample(int(audio['sample_rate']))
    segments = []
    for seg in audio['segments']:
        segments.append(
            SupervisionSegment(
                id=seg['sid'],
                recording_id=audio['aid'],
                start=Seconds(seg['begin_time']),
                duration=round(Seconds(seg['end_time'] - seg['begin_time']), ndigits=8),
                channel=0,
                language='English',
                speaker=seg['speaker'],
                text=seg['text_tn'],
            )
        )
    return recording, segments
def parse_utterance(
    audio: Any, root_path: Path, subsets: Sequence
) -> Tuple[Recording, Dict[str, List[SupervisionSegment]]]:
    sampling_rate = 16000
    recording = Recording(
        id=audio["aid"],
        sources=[
            AudioSource(
                type="file",
                channels=[0],
                source=str(root_path / audio["path"]),
            )
        ],
        num_samples=compute_num_samples(
            duration=audio["duration"], sampling_rate=sampling_rate
        ),
        sampling_rate=sampling_rate,
        duration=audio["duration"],
    )
    segments = {sub: [] for sub in subsets}
    for seg in audio["segments"]:
        segment = SupervisionSegment(
            id=seg["sid"],
            recording_id=audio["aid"],
            start=seg["begin_time"],
            duration=add_durations(
                seg["end_time"], -seg["begin_time"], sampling_rate=sampling_rate
            ),
            language="Chinese",
            text=seg["text"].strip(),
        )
        for sub in seg["subsets"]:
            if sub in subsets:
                segments[sub].append(segment)
    return recording, segments
def dummy_supervision(
    unique_id: int, start: float = 0.0, duration: float = 1.0
) -> SupervisionSegment:
    return SupervisionSegment(
        id=f'dummy-segment-{unique_id:04d}',
        recording_id='dummy-recording',
        start=start,
        duration=duration,
    )
def parse_utterance(
    dataset_split_path: Path,
    line: str,
    alignments: Dict[str, List[AlignmentItem]],
) -> Optional[Tuple[Recording, SupervisionSegment]]:
    recording_id, text = line.strip().split(maxsplit=1)
    # Create the Recording first
    audio_path = (
        dataset_split_path
        / Path(recording_id.replace("-", "/")).parent
        / f"{recording_id}.flac"
    )
    if not audio_path.is_file():
        logging.warning(f"No such file: {audio_path}")
        return None
    recording = Recording.from_file(audio_path, recording_id=recording_id)
    # Then, create the corresponding supervisions
    segment = SupervisionSegment(
        id=recording_id,
        recording_id=recording_id,
        start=0.0,
        duration=recording.duration,
        channel=0,
        language="English",
        speaker=re.sub(r"-.*", r"", recording.id),
        text=text.strip(),
        alignment={"word": alignments[recording_id]}
        if recording_id in alignments
        else None,
    )
    return recording, segment
def parse_utterance(
    row: Any, lang_path: Path, language: str
) -> Tuple[Recording, SupervisionSegment]:
    # Create the Recording first
    audio_path = lang_path / "clips" / row.path
    if not audio_path.is_file():
        raise ValueError(f"No such file: {audio_path}")
    recording_id = Path(row.path).stem
    recording = Recording.from_file(audio_path, recording_id=recording_id)
    # Then, create the corresponding supervisions
    segment = SupervisionSegment(
        id=recording_id,
        recording_id=recording_id,
        start=0.0,
        duration=recording.duration,
        channel=0,
        # Look up the language code => language name mapping (it is empty at the
        # time of writing this comment); if the language code is unknown, fall
        # back to using the language code itself.
        language=COMMONVOICE_CODE2LANG.get(language, language),
        speaker=row.client_id,
        text=row.sentence.strip(),
        gender=row.gender if row.gender != "nan" else None,
        custom={
            "age": row.age if row.age != "nan" else None,
            "accent": row.accent if row.accent != "nan" else None,
        },
    )
    return recording, segment
def dummy_supervision_set():
    return SupervisionSet.from_segments([
        SupervisionSegment(
            id='sup1', recording_id='rec1', start=3, duration=4, channel=0, text='dummy text'
        )
    ])
def from_dict(data: dict) -> 'Cut':
    feature_info = data.pop('features')
    supervision_infos = data.pop('supervisions')
    return Cut(
        **data,
        features=Features.from_dict(feature_info),
        supervisions=[SupervisionSegment.from_dict(s) for s in supervision_infos],
    )
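# A minimal round-trip sketch for the deserialization above, assuming from_dict
# is exposed as a static/class method on Cut (as the signature suggests) and that
# the usual ``to_dict`` counterpart produced the input dict:
def example_cut_roundtrip(cut1):
    restored = Cut.from_dict(cut1.to_dict())
    assert restored.id == cut1.id
    assert restored.supervisions == cut1.supervisions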
def dummy_supervision(
    unique_id: int,
    start: float = 0.0,
    duration: float = 1.0,
    text: str = "irrelevant",
) -> SupervisionSegment:
    return SupervisionSegment(
        id=f'dummy-segment-{unique_id:04d}',
        recording_id=f'dummy-recording-{unique_id:04d}',
        start=start,
        duration=duration,
        text=text,
    )
def cut1(dummy_features, dummy_recording):
    return MonoCut(
        id="cut-1",
        start=0.0,
        duration=10.0,
        channel=0,
        features=dummy_features,
        recording=dummy_recording,
        supervisions=[
            SupervisionSegment(id="sup-1", recording_id="irrelevant", start=0.5, duration=6.0),
            SupervisionSegment(id="sup-2", recording_id="irrelevant", start=7.0, duration=2.0),
        ],
    )
def test_create_supervision_segment_with_all_metadata():
    SupervisionSegment(
        id='X',
        recording_id='X',
        start=0.0,
        duration=0.1,
        channel=0,
        text='wysokie szczyty',
        language='polish',
        speaker='Janusz',
        gender='male',
    )
def cut2(dummy_features):
    return Cut(
        id='cut-2',
        start=180.0,
        duration=10.0,
        features=dummy_features,
        supervisions=[
            SupervisionSegment(id='sup-3', recording_id='irrelevant', start=3.0, duration=2.5)
        ],
    )
def dummy_supervision_set(): return SupervisionSet.from_segments([ SupervisionSegment( id="sup1", recording_id="rec1", start=3, duration=4, channel=0, text="dummy text", ) ])
def make_uem_segments(
    uem_path: Pathlike, recording: Recording
) -> List[SupervisionSegment]:
    # Each UEM line has the form: <recording-id> <channel> <start> <end>.
    lines = uem_path.read_text().splitlines()
    return [
        SupervisionSegment(
            id=f"{recording.id}-{int(100 * float(start)):06d}-{int(100 * float(end)):06d}",
            recording_id=recording.id,
            start=float(start),
            duration=round(float(end) - float(start), ndigits=8),
        )
        for _, _, start, end in map(str.split, lines)
    ]
def prepare_ljspeech(
    corpus_dir: Pathlike, output_dir: Optional[Pathlike] = None
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: A dict with the RecordingSet and SupervisionSet under the keys
        'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    # Read metadata.csv; each line is: recording_id|text|normalized_text.
    metadata_csv_path = corpus_dir / "metadata.csv"
    assert metadata_csv_path.is_file(), f"No such file: {metadata_csv_path}"
    recordings = []
    supervisions = []
    with open(metadata_csv_path) as f:
        for line in f:
            recording_id, text, _ = line.split("|")
            audio_path = corpus_dir / "wavs" / f"{recording_id}.wav"
            if not audio_path.is_file():
                logging.warning(f"No such file: {audio_path}")
                continue
            recording = Recording.from_file(audio_path)
            segment = SupervisionSegment(
                id=recording_id,
                recording_id=recording_id,
                start=0.0,
                duration=recording.duration,
                channel=0,
                language="English",
                gender="female",
                text=text,
            )
            recordings.append(recording)
            supervisions.append(segment)

    recording_set = RecordingSet.from_recordings(recordings)
    supervision_set = SupervisionSet.from_segments(supervisions)
    validate_recordings_and_supervisions(recording_set, supervision_set)

    if output_dir is not None:
        supervision_set.to_json(output_dir / "supervisions.json")
        recording_set.to_json(output_dir / "recordings.json")

    return {"recordings": recording_set, "supervisions": supervision_set}
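# A hedged usage sketch for the preparation function above; the corpus path is
# illustrative and assumes the standard LJSpeech-1.1 layout (metadata.csv + wavs/):
#
#   manifests = prepare_ljspeech("/data/LJSpeech-1.1", output_dir="data/manifests")
#   recordings = manifests["recordings"]      # RecordingSet
#   supervisions = manifests["supervisions"]  # SupervisionSet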
def test_overlay_cut_duration_and_supervisions(
    offset, expected_duration, exception_expectation, cut1, cut2
):
    with exception_expectation:
        mixed_cut = cut1.overlay(cut2, offset_other_by=offset)
        assert isinstance(mixed_cut, MixedCut)
        assert mixed_cut.duration == expected_duration
        assert mixed_cut.supervisions == [
            SupervisionSegment(id='sup-1', recording_id='irrelevant', start=0.5, duration=6.0),
            SupervisionSegment(id='sup-2', recording_id='irrelevant', start=7.0, duration=2.0),
            SupervisionSegment(id='sup-3', recording_id='irrelevant', start=3.0 + offset, duration=2.5),
        ]
def dummy_supervision(
    unique_id: int,
    start: float = 0.0,
    duration: float = 1.0,
    text: str = "irrelevant",
    # The default is evaluated once at definition time, which is fine here
    # because it is intended to be a constant dummy alignment.
    alignment: Optional[Dict[str, List[AlignmentItem]]] = dummy_alignment(),
) -> SupervisionSegment:
    return SupervisionSegment(
        id=f'dummy-segment-{unique_id:04d}',
        recording_id=f'dummy-recording-{unique_id:04d}',
        start=start,
        duration=duration,
        text=text,
        alignment=alignment,
    )
def make_corresponding_supervisions(audio: RecordingSet) -> SupervisionSet:
    """
    Prepare a supervision set - in this case it just describes which segments
    are available in the corpus, as the actual supervisions for speech separation
    come from the source recordings.
    """
    return SupervisionSet.from_segments(
        SupervisionSegment(
            id=f"{recording.id}-c{source.channels[0]}",
            recording_id=recording.id,
            start=0.0,
            duration=recording.duration,
            channel=source.channels[0],
        )
        for recording in audio
        for source in recording.sources
    )
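# A minimal call-pattern sketch, assuming ``audio`` is the RecordingSet prepared
# elsewhere in this recipe (the variable name is illustrative):
#
#   supervisions = make_corresponding_supervisions(audio)
#   validate_recordings_and_supervisions(audio, supervisions)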