def test_create_corpus_basic(tmpdir, create_sine, make_wav):
    """Check that a Corpus can be built from a minimal data set.

    One sine-wave recording and one single-phoneme label file are written
    for each of the "test", "train" and "valid" utterances, then a Corpus
    is constructed over the temporary directory with ``labels=None``.
    """
    from pathlib import Path
    from persephone.corpus import Corpus

    wav_dir = tmpdir.mkdir("wav")
    label_dir = tmpdir.mkdir("label")

    # Generate all sine wave data first, then write the recordings out.
    recordings = (
        ("test.wav", create_sine(note="A")),
        ("train.wav", create_sine(note="B")),
        ("valid.wav", create_sine(note="C")),
    )
    for wav_name, samples in recordings:
        make_wav(samples, str(wav_dir.join(wav_name)))

    # One label file per recording, each holding a single phoneme.
    transcriptions = (
        ("test.phonemes", "a"),
        ("train.phonemes", "b"),
        ("valid.phonemes", "c"),
    )
    for label_name, phoneme in transcriptions:
        label_dir.join(label_name).write(phoneme)

    corpus = Corpus(
        feat_type='fbank',
        label_type='phonemes',
        tgt_dir=Path(str(tmpdir)),
        labels=None,
    )
    assert corpus
def test_create_corpus_label_mismatch(tmpdir):
    """Check that a Corpus rejects a label set that does not match the data.

    The label files on disk contain "a", "b" and "c" while the supplied
    label set is {"1", "2", "3"}; construction is expected to raise
    LabelMismatchException.
    """
    from pathlib import Path
    from persephone.corpus import Corpus
    from persephone.exceptions import LabelMismatchException

    wav_dir = tmpdir.mkdir("wav")
    label_dir = tmpdir.mkdir("label")

    # The wav files are empty placeholders; no audio data is written.
    for wav_name in ("test.wav", "train.wav", "valid.wav"):
        wav_dir.join(wav_name).write("")

    transcriptions = (
        ("test.phonemes", "a"),
        ("train.phonemes", "b"),
        ("valid.phonemes", "c"),
    )
    for label_name, phoneme in transcriptions:
        label_dir.join(label_name).write(phoneme)

    # TODO: write prefix files

    target_dir = Path(str(tmpdir))
    with pytest.raises(LabelMismatchException):
        Corpus(
            feat_type='fbank',
            label_type='phonemes',
            tgt_dir=target_dir,
            labels={"1", "2", "3"},
        )
def prepared_data(request):
    """Load hypotheses, references and the corpus for one evaluation split.

    ``request.param`` selects the split. When it equals "test" the
    hypotheses and references are read from the experiment's ``test``
    directory and the corpus' test prefixes are used; for any other value
    they are read from the ``decoded`` directory and the validation
    prefixes are used.

    Returns:
        A tuple ``(request.param, corp, eval_prefixes, hyps, refs)`` where
        ``hyps`` and ``refs`` are lists of whitespace-split lines.
    """
    data_path = Path("testing/data/bkw")
    exp_path = Path("testing/exp/19/")
    use_test_split = request.param == "test"

    # TODO I shouldn't really be using "decoded" for the validation set dir
    # anymore.
    if use_test_split:
        split_dir = exp_path / "test"
        hyps_path = split_dir / "hyps"
    else:
        split_dir = exp_path / "decoded"
        hyps_path = split_dir / "best_hyps"
    refs_path = split_dir / "refs"

    with hyps_path.open() as hyps_file:
        hyps = [line.split() for line in hyps_file]
    with refs_path.open() as refs_file:
        refs = [line.split() for line in refs_file]

    corp = Corpus.from_pickle(data_path)
    eval_prefixes = corp.test_prefixes if use_test_split else corp.valid_prefixes

    return request.param, corp, eval_prefixes, hyps, refs
def _create_corpus():
    """Build a small Corpus of note-sequence recordings and return it.

    Writes two test utterances, two training utterances and one validation
    utterance (wav file plus phoneme label file each), writes the three
    prefix files listing them, then constructs the Corpus over ``tmpdir``
    and sanity-checks its feature type, label type and label set.
    """
    from persephone.corpus import Corpus

    wav_dir = tmpdir.mkdir("wav")
    label_dir = tmpdir.mkdir("label")

    # Create sine wave data. The single-note "A" and "B" sequences are
    # generated here but are not written to any wav file below.
    seq_a = create_note_sequence(notes=["A"])
    seq_b = create_note_sequence(notes=["B"])
    seq_c = create_note_sequence(notes=["C"])
    seq_a_b = create_note_sequence(notes=["A", "B"])
    seq_b_c = create_note_sequence(notes=["B", "C"])
    seq_a_b_c = create_note_sequence(notes=["A", "B", "C"])

    # (wav file name, audio samples, label file name, transcription)
    utterances = (
        # testing
        ("test1.wav", seq_a_b, "test1.phonemes", "A B"),
        ("test2.wav", seq_c, "test2.phonemes", "C"),
        # training
        ("train1.wav", seq_b_c, "train1.phonemes", "B C"),
        ("train2.wav", seq_a_b_c, "train2.phonemes", "A B C"),
        # validation
        ("valid.wav", seq_c, "valid.phonemes", "C"),
    )
    for wav_name, samples, label_name, transcription in utterances:
        make_wav(samples, str(wav_dir.join(wav_name)))
        label_dir.join(label_name).write(transcription)

    # Prefixes handling
    prefix_files = (
        ("test_prefixes.txt", "test1\ntest2"),
        ("train_prefixes.txt", "train1\ntrain2"),
        ("valid_prefixes.txt", "valid"),
    )
    for prefix_file_name, prefix_listing in prefix_files:
        tmpdir.join(prefix_file_name).write(prefix_listing)

    corpus = Corpus(
        feat_type='fbank',
        label_type='phonemes',
        tgt_dir=Path(str(tmpdir)),
        labels={"A", "B", "C"},
    )
    assert corpus
    assert corpus.feat_type == 'fbank'
    assert corpus.label_type == 'phonemes'
    assert set(corpus.labels) == {"A", "B", "C"}
    assert corpus.vocab_size == 3
    return corpus
def test_missing_experiment_dir():
    """Check that a Corpus raises FileNotFoundError for a missing directory.

    A Corpus needs an experiment directory; pointing ``tgt_dir`` at a path
    that does not exist is expected to fail.
    """
    from pathlib import Path
    from persephone.corpus import Corpus

    nonexistent_dir = Path("thisDoesNotExist")
    with pytest.raises(FileNotFoundError):
        Corpus(
            feat_type='fbank',
            label_type='phonemes',
            tgt_dir=nonexistent_dir,
            labels={"a", "b", "c"},
        )
def test_missing_wav_dir(tmpdir):
    """Check that a missing wav directory raises PersephoneException.

    Nothing is created inside ``tmpdir`` before the Corpus is constructed
    over it.
    """
    from pathlib import Path
    from persephone.corpus import Corpus
    from persephone.exceptions import PersephoneException

    target_dir = Path(str(tmpdir))
    with pytest.raises(PersephoneException):
        Corpus(
            feat_type='fbank',
            label_type='phonemes',
            tgt_dir=target_dir,
            labels={"a", "b", "c"},
        )
def test_create_corpus_no_data(tmpdir):
    """Check that building a Corpus with no data raises PersephoneException.

    The "wav" and "label" directories exist but are left empty.
    """
    from pathlib import Path
    from persephone.corpus import Corpus
    from persephone.exceptions import PersephoneException

    # Create the expected directory layout without putting any files in it.
    tmpdir.mkdir("wav")
    tmpdir.mkdir("label")

    target_dir = Path(str(tmpdir))
    with pytest.raises(PersephoneException):
        Corpus(
            feat_type='fbank',
            label_type='phonemes',
            tgt_dir=target_dir,
            labels={"a", "b", "c"},
        )
def test_corpus_with_predefined_data_sets(tmpdir, create_sine, make_wav):
    """Check Corpus construction when the data splits are predefined on disk.

    The splits are specified through the file system convention of three
    prefix files in the corpus directory:

    * `test_prefixes.txt`
    * `train_prefixes.txt`
    * `valid_prefixes.txt`

    After construction the test asserts the corpus' feature type, label
    type, label set and vocabulary size.
    """
    from pathlib import Path
    from persephone.corpus import Corpus

    wav_dir = tmpdir.mkdir("wav")
    label_dir = tmpdir.mkdir("label")

    # Generate all sine wave data first, then write the recordings out.
    recordings = (
        ("test.wav", create_sine(note="A")),
        ("train.wav", create_sine(note="B")),
        ("valid.wav", create_sine(note="C")),
    )
    for wav_name, samples in recordings:
        make_wav(samples, str(wav_dir.join(wav_name)))

    transcriptions = (
        ("test.phonemes", "a"),
        ("train.phonemes", "b"),
        ("valid.phonemes", "c"),
    )
    for label_name, phoneme in transcriptions:
        label_dir.join(label_name).write(phoneme)

    # Each prefix file names the single utterance belonging to that split.
    prefix_files = (
        ("test_prefixes.txt", "test"),
        ("train_prefixes.txt", "train"),
        ("valid_prefixes.txt", "valid"),
    )
    for prefix_file_name, prefix in prefix_files:
        tmpdir.join(prefix_file_name).write(prefix)

    corpus = Corpus(
        feat_type='fbank',
        label_type='phonemes',
        tgt_dir=Path(str(tmpdir)),
        labels={"a", "b", "c"},
    )
    assert corpus
    assert corpus.feat_type == 'fbank'
    assert corpus.label_type == 'phonemes'
    assert set(corpus.labels) == {"a", "b", "c"}
    assert corpus.vocab_size == 3
def post(corpusInfo):
    """Create a DBcorpus along with its data-set rows, files and labels.

    Args:
        corpusInfo: Mapping describing the corpus. The keys 'name',
            'labelType', 'featureType', 'training', 'testing' and
            'validation' are required (the last three are iterables of
            utterance ids); 'max_samples' is optional.

    Returns:
        ``(result, 201)`` with the serialized corpus on success, or
        ``("Invalid corpus provided", 400)`` if committing the session
        raises an IntegrityError.
    """
    # Largest value a signed 64-bit integer can hold. It is the default for
    # max_samples because the API will complain if a None is returned, so we
    # get much the same behavior by making the default the integer max value.
    # Note: this must be `**` (power); `2 ^ 63 - 1` is a bitwise XOR that
    # evaluates to 60.
    INT64_MAX = 2 ** 63 - 1
    max_samples = corpusInfo.get('max_samples', INT64_MAX)

    current_corpus = DBcorpus(name=corpusInfo['name'],
                              labelType=corpusInfo['labelType'],
                              featureType=corpusInfo['featureType'])
    current_corpus.max_samples = max_samples
    db.session.add(current_corpus)
    # Make sure that current_corpus.id exists before using as key
    db.session.flush()

    for train_utterance_id in corpusInfo['training']:
        db.session.add(
            TrainingDataSet(corpus_id=current_corpus.id,
                            utterance_id=train_utterance_id))

    for test_utterance_id in corpusInfo['testing']:
        db.session.add(
            TestingDataSet(corpus_id=current_corpus.id,
                           utterance_id=test_utterance_id))

    for validation_utterance_id in corpusInfo['validation']:
        db.session.add(
            ValidationDataSet(corpus_id=current_corpus.id,
                              utterance_id=validation_utterance_id))

    # Saving Corpus as UUIDs to remove name collision issues
    corpus_uuid = uuid.uuid1()
    corpus_path = Path(
        flask.current_app.config['CORPUS_PATH']) / str(corpus_uuid)
    audio_uploads_path = Path(flask.current_app.config['UPLOADED_AUDIO_DEST'])
    transcription_uploads_path = Path(
        flask.current_app.config['UPLOADED_TEXT_DEST'])
    create_corpus_file_structure(audio_uploads_path,
                                 transcription_uploads_path, current_corpus,
                                 corpus_path)

    # see if there's some other way of handling a UUID value directly into
    # SQLAlchemy
    current_corpus.filesystem_path = str(corpus_uuid)
    db.session.add(current_corpus)

    # Creating the corpus object has the side-effect of creating a directory
    # located at the path given to `tgt_dir`
    persephone_corpus = Corpus(
        feat_type=current_corpus.featureType,
        label_type=current_corpus.labelType,
        tgt_dir=corpus_path,
    )

    # A Label row and a CorpusLabelSet row linking it to this corpus are
    # added for every label of the persephone corpus.
    # NOTE(review): no check is made here for labels that already exist in
    # the Label table -- confirm how duplicates are meant to be handled.
    for label_text in persephone_corpus.labels:
        current_label = Label(label=label_text)
        db.session.add(current_label)
        # Make CorpusLabelSet entry
        db.session.add(
            CorpusLabelSet(corpus=current_corpus, label=current_label))

    try:
        db.session.commit()
    except sqlalchemy.exc.IntegrityError:
        # Discard the failed transaction so the session is left in a clean,
        # usable state.
        db.session.rollback()
        return "Invalid corpus provided", 400
    else:
        result = fix_corpus_format(CorpusSchema().dump(current_corpus).data)
        return result, 201